Mirror of https://github.com/opencv/opencv.git (synced 2025-01-18 06:03:15 +08:00)

Merge pull request #328 from jet47:new-gpu-fixes

Commit 11dfceb2c9
@@ -110,14 +110,15 @@ endif()

# Optional 3rd party components
# ===================================================
OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS)
OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) )
OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON)
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS))
OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_GTK "Include GTK support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_IMAGEIO "ImageIO support for OS X" OFF IF APPLE)

@@ -140,9 +141,9 @@ OCV_OPTION(WITH_V4L "Include Video 4 Linux support" ON
OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON IF WIN32 )
OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) )
OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )

# OpenCV build components

@@ -161,12 +162,12 @@ OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF I
OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID )

# 3rd party libs
OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE )
OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE OR CARMA )
OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE OR CARMA )

# OpenCV installation options

@@ -776,8 +777,9 @@ if(HAVE_CUDA)
status("")
status(" NVIDIA CUDA")

status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO)
status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO)
status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO)
status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO)
status(" USE NVCUVID:" HAVE_NVCUVID THEN YES ELSE NO)
status(" NVIDIA GPU arch:" ${OPENCV_CUDA_ARCH_BIN})
status(" NVIDIA PTX archs:" ${OPENCV_CUDA_ARCH_PTX})
status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
@@ -3,17 +3,17 @@ if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
return()
endif()

if (WIN32 AND NOT MSVC)
message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler suppoted on your platform).")
if(WIN32 AND NOT MSVC)
message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler supported on your platform).")
return()
endif()

if (CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(STATUS "CUDA compilation is disabled (due to Clang unsuppoted on your platform).")
if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(STATUS "CUDA compilation is disabled (due to Clang unsupported on your platform).")
return()
endif()

find_package(CUDA 4.1)
find_package(CUDA 4.2 QUIET)

if(CUDA_FOUND)
set(HAVE_CUDA 1)

@@ -26,15 +26,20 @@ if(CUDA_FOUND)
set(HAVE_CUBLAS 1)
endif()

message(STATUS "CUDA detected: " ${CUDA_VERSION})

if(${CUDA_VERSION_STRING} VERSION_GREATER "4.1")
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
if(WITH_NVCUVID)
find_cuda_helper_libs(nvcuvid)
set(HAVE_NVCUVID 1)
endif()

set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
message(STATUS "CUDA detected: " ${CUDA_VERSION})

if (CARMA)
set(CUDA_ARCH_BIN "2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
endif()

string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")

@@ -72,11 +77,20 @@ if(CUDA_FOUND)

# Tell NVCC to add PTX intermediate code for the specified architectures
string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}")
foreach(ARCH IN LISTS ARCH_LIST)
set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH})
set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}")
set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
endforeach()
foreach(ARCH IN LISTS ARCH_LIST)
set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH})
set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}")
set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
endforeach()

if(CARMA)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM" )

if (CMAKE_VERSION VERSION_LESS 2.8.10)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -ccbin=${CMAKE_CXX_COMPILER}" )
endif()

endif()

# These vars will be processed in other scripts
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})

@@ -84,7 +98,7 @@ if(CUDA_FOUND)

message(STATUS "CUDA NVCC target flags: ${CUDA_NVCC_FLAGS}")

OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF)
OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF)

if(CUDA_FAST_MATH)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)

@@ -92,7 +106,6 @@ if(CUDA_FOUND)

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)

unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp)

macro(ocv_cuda_compile VAR)

@@ -106,15 +119,15 @@ if(CUDA_FOUND)
string(REPLACE "-ggdb3" "" ${var} "${${var}}")
endforeach()

if (BUILD_SHARED_LIBS)
if(BUILD_SHARED_LIBS)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -DCVAPI_EXPORTS)
endif()

if(UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
endif()
if(APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
endif()

# disabled because of multiple warnings during building nvcc auto generated files
@@ -42,8 +42,9 @@
set(OpenCV_COMPUTE_CAPABILITIES @OpenCV_CUDA_CC_CONFIGCMAKE@)

set(OpenCV_CUDA_VERSION @OpenCV_CUDA_VERSION@)
set(OpenCV_USE_CUBLAS @HAVE_CUBLAS@)
set(OpenCV_USE_CUFFT @HAVE_CUFFT@)
set(OpenCV_USE_CUBLAS @HAVE_CUBLAS@)
set(OpenCV_USE_CUFFT @HAVE_CUFFT@)
set(OpenCV_USE_NVCUVID @HAVE_NVCUVID@)

# Android API level from which OpenCV has been compiled is remembered
set(OpenCV_ANDROID_NATIVE_API_LEVEL @OpenCV_ANDROID_NATIVE_API_LEVEL_CONFIGCMAKE@)

@@ -218,17 +219,22 @@ foreach(__opttype OPT DBG)
else()
#TODO: duplicates are annoying but they should not be the problem
endif()
# fix hard coded paths for CUDA libraries under Windows
if(WIN32 AND OpenCV_CUDA_VERSION AND NOT OpenCV_SHARED)

# CUDA
if(OpenCV_CUDA_VERSION AND (CARMA OR (WIN32 AND NOT OpenCV_SHARED)))
if(NOT CUDA_FOUND)
find_package(CUDA ${OpenCV_CUDA_VERSION} EXACT REQUIRED)
else()
if(NOT CUDA_VERSION_STRING VERSION_EQUAL OpenCV_CUDA_VERSION)
message(FATAL_ERROR "OpenCV static library compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
if(WIN32)
message(FATAL_ERROR "OpenCV static library was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
else()
message(FATAL_ERROR "OpenCV library for CARMA was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
endif()
endif()
endif()

list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_nvcuvid_LIBRARY} ${CUDA_nvcuvenc_LIBRARY})
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})

if(OpenCV_USE_CUBLAS)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUBLAS_LIBRARIES})

@@ -238,6 +244,13 @@ foreach(__opttype OPT DBG)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUFFT_LIBRARIES})
endif()

if(OpenCV_USE_NVCUVID)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvid_LIBRARIES})
endif()

if(WIN32)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvenc_LIBRARIES})
endif()
endif()
endforeach()
@@ -175,21 +175,15 @@
/* NVidia Cuda Runtime API*/
#cmakedefine HAVE_CUDA

/* OpenCL Support */
#cmakedefine HAVE_OPENCL

/* AMD's OpenCL Fast Fourier Transform Library*/
#cmakedefine HAVE_CLAMDFFT

/* AMD's Basic Linear Algebra Subprograms Library*/
#cmakedefine HAVE_CLAMDBLAS

/* NVidia Cuda Fast Fourier Transform (FFT) API*/
#cmakedefine HAVE_CUFFT

/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
#cmakedefine HAVE_CUBLAS

/* NVidia Video Decoding API*/
#cmakedefine HAVE_NVCUVID

/* Compile for 'real' NVIDIA GPU architectures */
#define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"

@@ -202,6 +196,15 @@
/* Create PTX or BIN for 1.0 compute capability */
#cmakedefine CUDA_ARCH_BIN_OR_PTX_10

/* OpenCL Support */
#cmakedefine HAVE_OPENCL

/* AMD's OpenCL Fast Fourier Transform Library*/
#cmakedefine HAVE_CLAMDFFT

/* AMD's Basic Linear Algebra Subprograms Library*/
#cmakedefine HAVE_CLAMDBLAS

/* VideoInput library */
#cmakedefine HAVE_VIDEOINPUT
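Editorial example (not part of the patch), assuming the generated cvconfig.h is on the include path: the #cmakedefine entries above become ordinary preprocessor switches on the C++ side.

#include "cvconfig.h"

void reportCudaBuildOptions()
{
#ifdef HAVE_CUFFT
    // cuFFT-backed code paths were compiled in.
#endif
#ifdef HAVE_NVCUVID
    // NVCUVID video-decoding code paths were compiled in.
#endif
}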
@@ -10,7 +10,6 @@ if(HAVE_CUDA)
file(GLOB lib_cuda "src/cuda/*.cu")
ocv_cuda_compile(cuda_objs ${lib_cuda})


set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
else()
set(lib_cuda "")
@@ -177,6 +177,20 @@ namespace cv
//#undef __CV_GPU_DEPR_BEFORE__
//#undef __CV_GPU_DEPR_AFTER__

namespace device
{
using cv::gpu::PtrSz;
using cv::gpu::PtrStep;
using cv::gpu::PtrStepSz;

using cv::gpu::PtrStepSzb;
using cv::gpu::PtrStepSzf;
using cv::gpu::PtrStepSzi;

using cv::gpu::PtrStepb;
using cv::gpu::PtrStepf;
using cv::gpu::PtrStepi;
}
}
}
@@ -79,6 +79,8 @@ namespace cv { namespace gpu
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
};

CV_EXPORTS bool deviceSupports(FeatureSet feature_set);

// Gives information about what GPU archs this OpenCV GPU module was
// compiled for
class CV_EXPORTS TargetArchs
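Editorial usage sketch (not from the patch): the new deviceSupports() entry point combines the compile-time arch check (TargetArchs::builtWith) with the runtime compute capability of the current device, so callers can guard Kepler-only paths such as warp shuffles.

#include <opencv2/gpu/gpu.hpp>

void launchReduction()
{
    if (cv::gpu::deviceSupports(cv::gpu::WARP_SHUFFLE_FUNCTIONS))
    {
        // Binary and device both support compute 3.0: take the __shfl-based path.
    }
    else
    {
        // Fall back to a shared-memory implementation.
    }
}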
@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/type_traits.hpp"

namespace cv { namespace gpu { namespace device
{

@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
void writeScalar(const int*);
void writeScalar(const float*);
void writeScalar(const double*);
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}}

@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////

template <typename T, typename D> struct Convertor : unary_function<T, D>
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
{
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}

__device__ __forceinline__ D operator()(const T& src) const
__device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
{
return saturate_cast<D>(alpha * src + beta);
}

double alpha, beta;
S alpha, beta;
};

namespace detail

@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
};
}

template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
{
};

template<typename T, typename D>
template<typename T, typename D, typename S>
void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
{
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta);
Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
}
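Editorial note (not part of the diff): the extra S template parameter lets the scale/shift arithmetic run in float for small source/destination depths and only fall back to double where precision requires it. A rough host-side equivalent of what Convertor<uchar, uchar, float> now computes:

#include <opencv2/core/core.hpp>

// Illustrative only: the old Convertor<uchar, uchar> always evaluated in double.
inline unsigned char convertPixel(unsigned char src, float alpha, float beta)
{
    return cv::saturate_cast<unsigned char>(alpha * src + beta);
}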
@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);

static const caller_t tab[8][8] =
static const caller_t tab[7][7] =
{
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},

{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},

{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},

{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},

{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},

{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},

{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},

{0,0,0,0,0,0,0,0}
{
cvt_<uchar, uchar, float>,
cvt_<uchar, schar, float>,
cvt_<uchar, ushort, float>,
cvt_<uchar, short, float>,
cvt_<uchar, int, float>,
cvt_<uchar, float, float>,
cvt_<uchar, double, double>
},
{
cvt_<schar, uchar, float>,
cvt_<schar, schar, float>,
cvt_<schar, ushort, float>,
cvt_<schar, short, float>,
cvt_<schar, int, float>,
cvt_<schar, float, float>,
cvt_<schar, double, double>
},
{
cvt_<ushort, uchar, float>,
cvt_<ushort, schar, float>,
cvt_<ushort, ushort, float>,
cvt_<ushort, short, float>,
cvt_<ushort, int, float>,
cvt_<ushort, float, float>,
cvt_<ushort, double, double>
},
{
cvt_<short, uchar, float>,
cvt_<short, schar, float>,
cvt_<short, ushort, float>,
cvt_<short, short, float>,
cvt_<short, int, float>,
cvt_<short, float, float>,
cvt_<short, double, double>
},
{
cvt_<int, uchar, float>,
cvt_<int, schar, float>,
cvt_<int, ushort, float>,
cvt_<int, short, float>,
cvt_<int, int, double>,
cvt_<int, float, double>,
cvt_<int, double, double>
},
{
cvt_<float, uchar, float>,
cvt_<float, schar, float>,
cvt_<float, ushort, float>,
cvt_<float, short, float>,
cvt_<float, int, float>,
cvt_<float, float, float>,
cvt_<float, double, double>
},
{
cvt_<double, uchar, double>,
cvt_<double, schar, double>,
cvt_<double, ushort, double>,
cvt_<double, short, double>,
cvt_<double, int, double>,
cvt_<double, float, double>,
cvt_<double, double, double>
}
};

caller_t func = tab[sdepth][ddepth];
if (!func)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");

func(src, dst, alpha, beta, stream);
}
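Added note (not part of the patch): sdepth and ddepth are OpenCV depth codes, so the table above is indexed CV_8U=0, CV_8S=1, CV_16U=2, CV_16S=3, CV_32S=4, CV_32F=5, CV_64F=6; the old 8x8 layout carried an unused row and column. A tiny sketch of the lookup, with hypothetical variable names:

#include <opencv2/core/core.hpp>

// Sketch only: a CV_32S -> CV_32F conversion would select tab[CV_32S][CV_32F],
// i.e. cvt_<int, float, double> in the new 7x7 layout.
const int sdepth = CV_32S;  // 4
const int ddepth = CV_32F;  // 5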
@@ -45,8 +45,7 @@
#include <iostream>

#ifdef HAVE_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <npp.h>

#define CUDART_MINIMUM_REQUIRED_VERSION 4010
@@ -69,33 +68,89 @@ using namespace cv::gpu;

namespace
{
// Compares value to set using the given comparator. Returns true if
// there is at least one element x in the set satisfying to: x cmp value
// predicate.
template <typename Comparer>
bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
class CudaArch
{
public:
CudaArch();

bool builtWith(FeatureSet feature_set) const;
bool hasPtx(int major, int minor) const;
bool hasBin(int major, int minor) const;
bool hasEqualOrLessPtx(int major, int minor) const;
bool hasEqualOrGreaterPtx(int major, int minor) const;
bool hasEqualOrGreaterBin(int major, int minor) const;

private:
static void fromStr(const string& set_as_str, vector<int>& arr);

vector<int> bin;
vector<int> ptx;
vector<int> features;
};

const CudaArch cudaArch;

CudaArch::CudaArch()
{
#ifdef HAVE_CUDA
fromStr(CUDA_ARCH_BIN, bin);
fromStr(CUDA_ARCH_PTX, ptx);
fromStr(CUDA_ARCH_FEATURES, features);
#endif
}

bool CudaArch::builtWith(FeatureSet feature_set) const
{
return !features.empty() && (features.back() >= feature_set);
}

bool CudaArch::hasPtx(int major, int minor) const
{
return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
}

bool CudaArch::hasBin(int major, int minor) const
{
return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
}

bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.front() <= major * 10 + minor);
}

bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.back() >= major * 10 + minor);
}

bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
{
return !bin.empty() && (bin.back() >= major * 10 + minor);
}

void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
{
if (set_as_str.find_first_not_of(" ") == string::npos)
return false;
return;

std::stringstream stream(set_as_str);
istringstream stream(set_as_str);
int cur_value;

while (!stream.eof())
{
stream >> cur_value;
if (cmp(cur_value, value))
return true;
arr.push_back(cur_value);
}

return false;
sort(arr.begin(), arr.end());
}
}

bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>());
return cudaArch.builtWith(feature_set);
#else
(void)feature_set;
return false;
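Editorial illustration (hypothetical values, not from the patch): CUDA_ARCH_BIN, CUDA_ARCH_PTX and CUDA_ARCH_FEATURES are the space-separated arch strings baked in by cvconfig.h, so CudaArch::fromStr effectively performs the parsing sketched below, after which e.g. hasEqualOrGreaterBin(3, 0) just checks bin.back() >= 30.

#include <algorithm>
#include <sstream>
#include <string>
#include <vector>

// Standalone sketch of what CudaArch::fromStr computes (fromStr itself is private).
std::vector<int> parseArchList(const std::string& s)
{
    std::vector<int> arr;
    std::istringstream stream(s);
    int v;
    while (stream >> v)
        arr.push_back(v);
    std::sort(arr.begin(), arr.end());
    return arr;  // "11 12 13 20 21 30" -> {11, 12, 13, 20, 21, 30}
}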
@@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>());
return cudaArch.hasPtx(major, minor);
#else
(void)major;
(void)minor;

@@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasBin(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>());
return cudaArch.hasBin(major, minor);
#else
(void)major;
(void)minor;

@@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor,
std::less_equal<int>());
return cudaArch.hasEqualOrLessPtx(major, minor);
#else
(void)major;
(void)minor;

@@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)

bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
{
return hasEqualOrGreaterPtx(major, minor) ||
hasEqualOrGreaterBin(major, minor);
return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
}

bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>());
return cudaArch.hasEqualOrGreaterPtx(major, minor);
#else
(void)major;
(void)minor;

@@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor,
std::greater_equal<int>());
return cudaArch.hasEqualOrGreaterBin(major, minor);
#else
(void)major;
(void)minor;

@@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
#endif
}

bool cv::gpu::deviceSupports(FeatureSet feature_set)
{
static int versions[] =
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));

const int devId = getDevice();

int version;

if (devId < cache_size && versions[devId] >= 0)
version = versions[devId];
else
{
DeviceInfo dev(devId);
version = dev.majorVersion() * 10 + dev.minorVersion();
if (devId < cache_size)
versions[devId] = version;
}

return TargetArchs::builtWith(feature_set) && (version >= feature_set);
}

#if !defined (HAVE_CUDA)

#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
@@ -315,18 +392,6 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)

namespace
{
template <class T> void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
*attribute = T();
//CUresult error = CUDA_SUCCESS;// = cuDeviceGetAttribute( attribute, device_attribute, device ); why link erros under ubuntu??
CUresult error = cuDeviceGetAttribute( attribute, device_attribute, device );
if( CUDA_SUCCESS == error )
return;

printf("Driver API error = %04d\n", error);
cv::gpu::error("driver API error", __FILE__, __LINE__);
}

int convertSMVer2Cores(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM

@@ -335,7 +400,7 @@ namespace
int Cores;
} SMtoCores;

SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, { -1, -1 } };
SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } };

int index = 0;
while (gpuArchCoresPerSM[index].SM != -1)

@@ -344,7 +409,7 @@ namespace
return gpuArchCoresPerSM[index].Cores;
index++;
}
printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);

return -1;
}
}
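Editorial illustration (not in the patch): with the added {0x35, 192} entry, devices reporting SM 3.5 now resolve to a core count instead of falling through to the -1 error path.

// Called from within core.cpp's anonymous namespace; 0x35 encodes major 3, minor 5.
int coresPerSM = convertSMVer2Cores(3, 5);   // 192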
@@ -382,22 +447,13 @@ void cv::gpu::printCudaDeviceInfo(int device)
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor);
printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n",
prop.multiProcessorCount, convertSMVer2Cores(prop.major, prop.minor),
convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);

int cores = convertSMVer2Cores(prop.major, prop.minor);
if (cores > 0)
printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);

printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f);

// This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
int memoryClock, memBusWidth, L2CacheSize;
getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );

printf(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f);
printf(" Memory Bus Width: %d-bit\n", memBusWidth);
if (L2CacheSize)
printf(" L2 Cache Size: %d bytes\n", L2CacheSize);

printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);

@@ -457,7 +513,12 @@ void cv::gpu::printShortCudaDeviceInfo(int device)

const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);

int cores = convertSMVer2Cores(prop.major, prop.minor);
if (cores > 0)
printf(", %d cores", cores * prop.multiProcessorCount);

printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
}
fflush(stdout);
@@ -5,7 +5,7 @@ endif()
set(the_description "GPU-accelerated Computer Vision")
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_photo opencv_legacy)

ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda" "${CMAKE_CURRENT_SOURCE_DIR}/../highgui/src")
ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")

file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.hpp" "include/opencv2/${name}/device/*.h")

@@ -15,24 +15,21 @@ file(GLOB lib_cuda_hdrs "src/cuda/*.hpp" "src/cuda/*.h")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_cuda "src/cuda/*.cu*")

source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
source_group("Device" FILES ${lib_device_hdrs})
source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})

if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
set(ncv_files ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
set(ncv_files ${ncv_srcs} ${ncv_cuda})

source_group("Src\\NVidia" FILES ${ncv_files})
ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS})
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations /wd4211 /wd4201 /wd4100 /wd4505 /wd4408)
string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")

#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")

if(MSVC)

@@ -47,23 +44,18 @@ if (HAVE_CUDA)

ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})

#CUDA_BUILD_CLEAN_TARGET()

set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})

if(NOT APPLE)
unset(CUDA_nvcuvid_LIBRARY CACHE)
find_cuda_helper_libs(nvcuvid)
if(WITH_NVCUVID)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
endif()

if(WIN32)
unset(CUDA_nvcuvenc_LIBRARY CACHE)
find_cuda_helper_libs(nvcuvenc)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
endif()

if(NOT APPLE AND WITH_FFMPEG)
if(WITH_FFMPEG)
set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES})
endif()
else()
@@ -216,6 +216,86 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)

OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS

OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)

OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)

#undef OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)

OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS

OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)

OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)

#undef OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS
}}} // namespace cv { namespace gpu { namespace device

#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
@@ -85,8 +85,6 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}

#ifdef __CUDACC__

namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)

@@ -96,19 +94,25 @@ namespace cv { namespace gpu

namespace device
{
using cv::gpu::divUp;

#ifdef __CUDACC__
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
#ifdef _WIN32
typedef unsigned int uint;
#endif

template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
#endif // __CUDACC__
}
}}

#endif // __CUDACC__


#endif // __OPENCV_GPU_COMMON_HPP__
File diff suppressed because one or more lines are too long

modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp (new file, 361 lines)
@@ -0,0 +1,361 @@
/* [Standard OpenCV BSD license header omitted for brevity] */
#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
namespace reduce_detail
{
template <typename T> struct GetType;
template <typename T> struct GetType<T*>
{
typedef T type;
};
template <typename T> struct GetType<volatile T*>
{
typedef T type;
};
template <typename T> struct GetType<T&>
{
typedef T type;
};

template <unsigned int I, unsigned int N>
struct For
{
template <class PointerTuple, class ValTuple>
static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
{
thrust::get<I>(smem)[tid] = thrust::get<I>(val);

For<I + 1, N>::loadToSmem(smem, val, tid);
}
template <class PointerTuple, class ValTuple>
static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
{
thrust::get<I>(val) = thrust::get<I>(smem)[tid];

For<I + 1, N>::loadFromSmem(smem, val, tid);
}

template <class PointerTuple, class ValTuple, class OpTuple>
static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
{
typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

For<I + 1, N>::merge(smem, val, tid, delta, op);
}
template <class ValTuple, class OpTuple>
static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
{
typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

For<I + 1, N>::mergeShfl(val, delta, width, op);
}
};
template <unsigned int N>
struct For<N, N>
{
template <class PointerTuple, class ValTuple>
static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
{
}
template <class PointerTuple, class ValTuple>
static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
{
}

template <class PointerTuple, class ValTuple, class OpTuple>
static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
{
}
template <class ValTuple, class OpTuple>
static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
{
}
};

template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
{
smem[tid] = val;
}
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
{
val = smem[tid];
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
}

template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
{
T reg = smem[tid + delta];
smem[tid] = val = op(val, reg);
}
template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
{
T reg = shfl_down(val, delta, width);
val = op(val, reg);
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid,
unsigned int delta,
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
}
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int delta,
unsigned int width,
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
}

template <unsigned int N> struct Generic
{
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
loadToSmem(smem, val, tid);
if (N >= 32)
__syncthreads();

if (N >= 2048)
{
if (tid < 1024)
merge(smem, val, tid, 1024, op);

__syncthreads();
}
if (N >= 1024)
{
if (tid < 512)
merge(smem, val, tid, 512, op);

__syncthreads();
}
if (N >= 512)
{
if (tid < 256)
merge(smem, val, tid, 256, op);

__syncthreads();
}
if (N >= 256)
{
if (tid < 128)
merge(smem, val, tid, 128, op);

__syncthreads();
}
if (N >= 128)
{
if (tid < 64)
merge(smem, val, tid, 64, op);

__syncthreads();
}
if (N >= 64)
{
if (tid < 32)
merge(smem, val, tid, 32, op);
}

if (tid < 16)
{
merge(smem, val, tid, 16, op);
merge(smem, val, tid, 8, op);
merge(smem, val, tid, 4, op);
merge(smem, val, tid, 2, op);
merge(smem, val, tid, 1, op);
}
}
};

template <unsigned int I, typename Pointer, typename Reference, class Op>
struct Unroll
{
static __device__ void loopShfl(Reference val, Op op, unsigned int N)
{
mergeShfl(val, I, N, op);
Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
}
static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
{
merge(smem, val, tid, I, op);
Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
}
};
template <typename Pointer, typename Reference, class Op>
struct Unroll<0, Pointer, Reference, Op>
{
static __device__ void loopShfl(Reference, Op, unsigned int)
{
}
static __device__ void loop(Pointer, Reference, unsigned int, Op)
{
}
};

template <unsigned int N> struct WarpOptimized
{
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
#if __CUDA_ARCH__ >= 300
(void) smem;
(void) tid;

Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
#else
loadToSmem(smem, val, tid);

if (tid < N / 2)
Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
};

template <unsigned int N> struct GenericOptimized32
{
enum { M = N / 32 };

template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
const unsigned int laneId = Warp::laneId();

#if __CUDA_ARCH__ >= 300
Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

if (laneId == 0)
loadToSmem(smem, val, tid / 32);
#else
loadToSmem(smem, val, tid);

if (laneId < 16)
Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

__syncthreads();

if (laneId == 0)
loadToSmem(smem, val, tid / 32);
#endif

__syncthreads();

loadFromSmem(smem, val, tid);

if (tid < 32)
{
#if __CUDA_ARCH__ >= 300
Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
#else
Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
}
};

template <bool val, class T1, class T2> struct StaticIf;
template <class T1, class T2> struct StaticIf<true, T1, T2>
{
typedef T1 type;
};
template <class T1, class T2> struct StaticIf<false, T1, T2>
{
typedef T2 type;
};

template <unsigned int N> struct IsPowerOf2
{
enum { value = ((N != 0) && !(N & (N - 1))) };
};

template <unsigned int N> struct Dispatcher
{
typedef typename StaticIf<
(N <= 32) && IsPowerOf2<N>::value,
WarpOptimized<N>,
typename StaticIf<
(N <= 1024) && IsPowerOf2<N>::value,
GenericOptimized32<N>,
Generic<N>
>::type
>::type reductor;
};
}
}}}

#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
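Editorial sketch (not part of the new header): Dispatcher<N>::reductor resolves at compile time to WarpOptimized<N> for power-of-two block sizes up to 32, GenericOptimized32<N> for power-of-two sizes up to 1024, and Generic<N> otherwise.

#include "opencv2/gpu/device/detail/reduce.hpp"

using namespace cv::gpu::device::reduce_detail;

typedef Dispatcher<32>::reductor  WarpSized;   // WarpOptimized<32>
typedef Dispatcher<256>::reductor BlockSized;  // GenericOptimized32<256>
typedef Dispatcher<100>::reductor NonPow2;     // Generic<100>, since 100 is not a power of two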
modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp (new file, 498 lines)

@@ -0,0 +1,498 @@
/* [Standard OpenCV BSD license header omitted for brevity] */
#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
||||
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "../warp.hpp"
|
||||
#include "../warp_shuffle.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace reduce_key_val_detail
|
||||
{
|
||||
template <typename T> struct GetType;
|
||||
template <typename T> struct GetType<T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<volatile T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<T&>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template <unsigned int I, unsigned int N>
|
||||
struct For
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(data);
|
||||
|
||||
For<I + 1, N>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(data) = thrust::get<I>(smem)[tid];
|
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
|
||||
{
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
|
||||
For<I + 1, N>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
|
||||
For<I + 1, N>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
}
|
||||
|
||||
For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
|
||||
const ValPointerTuple& svals, const ValReferenceTuple& val,
|
||||
const CmpTuple& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
}
|
||||
|
||||
For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
};
|
||||
template <unsigned int N>
|
||||
struct For<N, N>
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
|
||||
const ValPointerTuple&, const ValReferenceTuple&,
|
||||
const CmpTuple&,
|
||||
unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// loadToSmem
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
smem[tid] = data;
|
||||
}
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
data = smem[tid];
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// copyVals
|
||||
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
|
||||
{
|
||||
val = shfl_down(val, delta, width);
|
||||
}
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
svals[tid] = val = svals[tid + delta];
|
||||
}
|
||||
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int delta,
|
||||
int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// merge
|
||||
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
|
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Generic
|
||||
|
||||
template <unsigned int N> struct Generic
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadValsToSmem(svals, val, tid);
|
||||
if (N >= 32)
|
||||
__syncthreads();
|
||||
|
||||
if (N >= 2048)
|
||||
{
|
||||
if (tid < 1024)
|
||||
merge(skeys, key, svals, val, cmp, tid, 1024);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 1024)
|
||||
{
|
||||
if (tid < 512)
|
||||
merge(skeys, key, svals, val, cmp, tid, 512);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 512)
|
||||
{
|
||||
if (tid < 256)
|
||||
merge(skeys, key, svals, val, cmp, tid, 256);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 256)
|
||||
{
|
||||
if (tid < 128)
|
||||
merge(skeys, key, svals, val, cmp, tid, 128);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 128)
|
||||
{
|
||||
if (tid < 64)
|
||||
merge(skeys, key, svals, val, cmp, tid, 64);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 64)
|
||||
{
|
||||
if (tid < 32)
|
||||
merge(skeys, key, svals, val, cmp, tid, 32);
|
||||
}
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, 16);
|
||||
merge(skeys, key, svals, val, cmp, tid, 8);
|
||||
merge(skeys, key, svals, val, cmp, tid, 4);
|
||||
merge(skeys, key, svals, val, cmp, tid, 2);
|
||||
merge(skeys, key, svals, val, cmp, tid, 1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
|
||||
struct Unroll
|
||||
{
|
||||
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
|
||||
{
|
||||
mergeShfl(key, val, cmp, I, N);
|
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||
}
|
||||
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, I);
|
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
};
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
struct Unroll<0, KP, KR, VP, VR, Cmp>
|
||||
{
|
||||
static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
|
||||
{
|
||||
}
|
||||
static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct WarpOptimized
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
(void) skeys;
|
||||
(void) svals;
|
||||
(void) tid;
|
||||
|
||||
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||
#else
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadToSmem(svals, val, tid);
|
||||
|
||||
if (tid < N / 2)
|
||||
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct GenericOptimized32
|
||||
{
|
||||
enum { M = N / 32 };
|
||||
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
const unsigned int laneId = Warp::laneId();
|
||||
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
|
||||
|
||||
if (laneId == 0)
|
||||
{
|
||||
loadToSmem(skeys, key, tid / 32);
|
||||
loadToSmem(svals, val, tid / 32);
|
||||
}
|
||||
#else
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadToSmem(svals, val, tid);
|
||||
|
||||
if (laneId < 16)
|
||||
Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (laneId == 0)
|
||||
{
|
||||
loadToSmem(skeys, key, tid / 32);
|
||||
loadToSmem(svals, val, tid / 32);
|
||||
}
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
loadFromSmem(skeys, key, tid);
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
loadFromSmem(svals, val, tid);
|
||||
|
||||
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
|
||||
#else
|
||||
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool val, class T1, class T2> struct StaticIf;
|
||||
template <class T1, class T2> struct StaticIf<true, T1, T2>
|
||||
{
|
||||
typedef T1 type;
|
||||
};
|
||||
template <class T1, class T2> struct StaticIf<false, T1, T2>
|
||||
{
|
||||
typedef T2 type;
|
||||
};
|
||||
|
||||
template <unsigned int N> struct IsPowerOf2
|
||||
{
|
||||
enum { value = ((N != 0) && !(N & (N - 1))) };
|
||||
};
|
||||
|
||||
template <unsigned int N> struct Dispatcher
|
||||
{
|
||||
typedef typename StaticIf<
|
||||
(N <= 32) && IsPowerOf2<N>::value,
|
||||
WarpOptimized<N>,
|
||||
typename StaticIf<
|
||||
(N <= 1024) && IsPowerOf2<N>::value,
|
||||
GenericOptimized32<N>,
|
||||
Generic<N>
|
||||
>::type
|
||||
>::type reductor;
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
@ -1,841 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__
|
||||
#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace utility_detail
|
||||
{
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reductor
|
||||
|
||||
template <int n> struct WarpReductor
|
||||
{
|
||||
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
if (tid < n)
|
||||
data[tid] = partial_reduction;
|
||||
if (n > 32) __syncthreads();
|
||||
|
||||
if (n > 32)
|
||||
{
|
||||
if (tid < n - 32)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
if (tid < 16)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 16)
|
||||
{
|
||||
if (tid < n - 16)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
if (tid < 8)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 8)
|
||||
{
|
||||
if (tid < n - 8)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
if (tid < 4)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 4)
|
||||
{
|
||||
if (tid < n - 4)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
if (tid < 2)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 2)
|
||||
{
|
||||
if (tid < n - 2)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
if (tid < 2)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<64>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
__syncthreads();
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<32>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<16>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 8)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<8>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 4)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool warp> struct ReductionDispatcher;
|
||||
template <> struct ReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
WarpReductor<n>::reduce(data, partial_reduction, tid, op);
|
||||
}
|
||||
};
|
||||
template <> struct ReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
if (tid < n)
|
||||
data[tid] = partial_reduction;
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }
|
||||
if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }
|
||||
if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); }
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PredValWarpReductor
|
||||
|
||||
template <int n> struct PredValWarpReductor;
|
||||
template <> struct PredValWarpReductor<64>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 32)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 32];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 32];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredValWarpReductor<32>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 16)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct PredValWarpReductor<16>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 8)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredValWarpReductor<8>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 4)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool warp> struct PredValReductionDispatcher;
|
||||
template <> struct PredValReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);
|
||||
}
|
||||
};
|
||||
template <> struct PredValReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
if (n >= 512 && tid < 256)
|
||||
{
|
||||
T reg = sdata[tid + 256];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 256];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 256 && tid < 128)
|
||||
{
|
||||
T reg = sdata[tid + 128];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 128 && tid < 64)
|
||||
{
|
||||
T reg = sdata[tid + 64];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (n >= 64)
|
||||
{
|
||||
T reg = sdata[tid + 32];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 32];
|
||||
}
|
||||
}
|
||||
if (n >= 32)
|
||||
{
|
||||
T reg = sdata[tid + 16];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
}
|
||||
if (n >= 16)
|
||||
{
|
||||
T reg = sdata[tid + 8];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
}
|
||||
if (n >= 8)
|
||||
{
|
||||
T reg = sdata[tid + 4];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
}
|
||||
if (n >= 4)
|
||||
{
|
||||
T reg = sdata[tid + 2];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
}
|
||||
if (n >= 2)
|
||||
{
|
||||
T reg = sdata[tid + 1];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PredVal2WarpReductor
|
||||
|
||||
template <int n> struct PredVal2WarpReductor;
|
||||
template <> struct PredVal2WarpReductor<64>
|
||||
{
|
||||
template <typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 32)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
T reg = sdata[tid + 32];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 32];
|
||||
sval2[tid] = myVal2 = sval2[tid + 32];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 16];
|
||||
sval2[tid] = myVal2 = sval2[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 8];
|
||||
sval2[tid] = myVal2 = sval2[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredVal2WarpReductor<32>
|
||||
{
|
||||
template <typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 16)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
T reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 16];
|
||||
sval2[tid] = myVal2 = sval2[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 8];
|
||||
sval2[tid] = myVal2 = sval2[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct PredVal2WarpReductor<16>
|
||||
{
|
||||
template <typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 8)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
T reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 8];
|
||||
sval2[tid] = myVal2 = sval2[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredVal2WarpReductor<8>
|
||||
{
|
||||
template <typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 4)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
T reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool warp> struct PredVal2ReductionDispatcher;
|
||||
template <> struct PredVal2ReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
|
||||
}
|
||||
};
|
||||
template <> struct PredVal2ReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
if (n >= 512 && tid < 256)
|
||||
{
|
||||
T reg = sdata[tid + 256];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 256];
|
||||
sval2[tid] = myVal2 = sval2[tid + 256];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 256 && tid < 128)
|
||||
{
|
||||
T reg = sdata[tid + 128];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 128];
|
||||
sval2[tid] = myVal2 = sval2[tid + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 128 && tid < 64)
|
||||
{
|
||||
T reg = sdata[tid + 64];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 64];
|
||||
sval2[tid] = myVal2 = sval2[tid + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (n >= 64)
|
||||
{
|
||||
T reg = sdata[tid + 32];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 32];
|
||||
sval2[tid] = myVal2 = sval2[tid + 32];
|
||||
}
|
||||
}
|
||||
if (n >= 32)
|
||||
{
|
||||
T reg = sdata[tid + 16];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 16];
|
||||
sval2[tid] = myVal2 = sval2[tid + 16];
|
||||
}
|
||||
}
|
||||
if (n >= 16)
|
||||
{
|
||||
T reg = sdata[tid + 8];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 8];
|
||||
sval2[tid] = myVal2 = sval2[tid + 8];
|
||||
}
|
||||
}
|
||||
if (n >= 8)
|
||||
{
|
||||
T reg = sdata[tid + 4];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
}
|
||||
if (n >= 4)
|
||||
{
|
||||
T reg = sdata[tid + 2];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
}
|
||||
if (n >= 2)
|
||||
{
|
||||
T reg = sdata[tid + 1];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace utility_detail
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__
|
@ -44,7 +44,6 @@
|
||||
#define OPENCV_GPU_EMULATION_HPP_
|
||||
|
||||
#include "warp_reduce.hpp"
|
||||
#include <stdio.h>
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
|
@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
|
||||
template <> struct name<type> : binary_function<type, type, type> \
|
||||
{ \
|
||||
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
|
||||
__device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\
|
||||
__device__ __forceinline__ name():binary_function<type, type, type>(){}\
|
||||
__device__ __forceinline__ name() {}\
|
||||
__device__ __forceinline__ name(const name&) {}\
|
||||
};
|
||||
|
||||
template <typename T> struct maximum : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
{
|
||||
return lhs < rhs ? rhs : lhs;
|
||||
return max(lhs, rhs);
|
||||
}
|
||||
__device__ __forceinline__ maximum(const maximum& other):binary_function<T, T, T>(){}
|
||||
__device__ __forceinline__ maximum():binary_function<T, T, T>(){}
|
||||
__device__ __forceinline__ maximum() {}
|
||||
__device__ __forceinline__ maximum(const maximum&) {}
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
|
||||
@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
{
|
||||
return lhs < rhs ? lhs : rhs;
|
||||
return min(lhs, rhs);
|
||||
}
|
||||
__device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){}
|
||||
__device__ __forceinline__ minimum():binary_function<T, T, T>(){}
|
||||
__device__ __forceinline__ minimum() {}
|
||||
__device__ __forceinline__ minimum(const minimum&) {}
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
|
||||
@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
// Math functions
|
||||
///bound=========================================
|
||||
|
||||
template <typename T> struct abs_func : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
|
||||
{
|
||||
return abs(x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
|
||||
{
|
||||
__device__ __forceinline__ unsigned char operator ()(unsigned char x) const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
|
||||
{
|
||||
__device__ __forceinline__ signed char operator ()(signed char x) const
|
||||
{
|
||||
return ::abs((int)x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<char> : unary_function<char, char>
|
||||
{
|
||||
__device__ __forceinline__ char operator ()(char x) const
|
||||
{
|
||||
return ::abs((int)x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
|
||||
{
|
||||
__device__ __forceinline__ unsigned short operator ()(unsigned short x) const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<short> : unary_function<short, short>
|
||||
{
|
||||
__device__ __forceinline__ short operator ()(short x) const
|
||||
{
|
||||
return ::abs((int)x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
|
||||
{
|
||||
__device__ __forceinline__ unsigned int operator ()(unsigned int x) const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<int> : unary_function<int, int>
|
||||
{
|
||||
__device__ __forceinline__ int operator ()(int x) const
|
||||
{
|
||||
return ::abs(x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<float> : unary_function<float, float>
|
||||
{
|
||||
__device__ __forceinline__ float operator ()(float x) const
|
||||
{
|
||||
return ::fabsf(x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<double> : unary_function<double, double>
|
||||
{
|
||||
__device__ __forceinline__ double operator ()(double x) const
|
||||
{
|
||||
return ::fabs(x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
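A quick device-side illustration of the specializations above (hypothetical snippet): unsigned types pass through unchanged, small signed integers go through ::abs on int, and floating-point types use ::fabsf / ::fabs:

    // Illustrative only; absExamples is a hypothetical helper, not part of this header.
    __device__ __forceinline__ void absExamples()
    {
        unsigned char u = abs_func<unsigned char>()(200);   // 200: identity for unsigned types
        short         s = abs_func<short>()((short)-7);     // 7:   via ::abs((int)x)
        float         f = abs_func<float>()(-1.5f);         // 1.5f: via ::fabsf
        (void)u; (void)s; (void)f;                           // silence unused-variable warnings
    }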
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
|
||||
template <typename T> struct name ## _func : unary_function<T, float> \
|
||||
{ \
|
||||
@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
|
||||
{ \
|
||||
return func ## f(v); \
|
||||
} \
|
||||
__device__ __forceinline__ name ## _func() {} \
|
||||
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
|
||||
}; \
|
||||
template <> struct name ## _func<double> : unary_function<double, double> \
|
||||
{ \
|
||||
@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
|
||||
{ \
|
||||
return func(v); \
|
||||
} \
|
||||
__device__ __forceinline__ name ## _func() {} \
|
||||
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
|
||||
};
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
|
||||
@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
|
||||
|
197
modules/gpu/include/opencv2/gpu/device/reduce.hpp
Normal file
@ -0,0 +1,197 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCE_HPP__
|
||||
#define __OPENCV_GPU_REDUCE_HPP__
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "detail/reduce.hpp"
|
||||
#include "detail/reduce_key_val.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <int N, typename T, class Op>
|
||||
__device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
|
||||
{
|
||||
reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
|
||||
}
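A minimal usage sketch for the overload above (hypothetical kernel and functor names; a 256-thread block summing one int per thread):

    // Illustrative only: SumOp and blockSum are hypothetical names, not part of this header.
    struct SumOp
    {
        __device__ __forceinline__ int operator ()(int a, int b) const { return a + b; }
    };

    __global__ void blockSum(const int* src, int* dst)
    {
        __shared__ int smem[256];                      // one slot per thread

        const unsigned int tid = threadIdx.x;
        int val = src[blockIdx.x * 256 + tid];         // each thread loads one element

        cv::gpu::device::reduce<256>(smem, val, tid, SumOp());

        if (tid == 0)
            dst[blockIdx.x] = val;                     // thread 0 holds the block total
    }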
|
||||
template <int N,
|
||||
typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
reduce_detail::Dispatcher<N>::reductor::template reduce<
|
||||
const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
|
||||
}
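The tuple overload above reduces several values in one pass; a minimal sketch (hypothetical kernel and functor names), pairing it with the smem_tuple() helper defined later in this header:

    // Illustrative only: FAdd and blockSumAndSqr are hypothetical names.
    struct FAdd
    {
        __device__ __forceinline__ float operator ()(float a, float b) const { return a + b; }
    };

    __global__ void blockSumAndSqr(const float* src, float* dstSum, float* dstSqr)
    {
        __shared__ float ssum[256];
        __shared__ float ssqr[256];

        const unsigned int tid = threadIdx.x;
        const float v = src[blockIdx.x * 256 + tid];
        float sum = v;
        float sqr = v * v;

        // both reductions share one pass over the block
        cv::gpu::device::reduce<256>(cv::gpu::device::smem_tuple(ssum, ssqr),
                                     thrust::tie(sum, sqr),
                                     tid,
                                     thrust::make_tuple(FAdd(), FAdd()));

        if (tid == 0)
        {
            dstSum[blockIdx.x] = sum;
            dstSqr[blockIdx.x] = sqr;
        }
    }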
|
||||
|
||||
template <unsigned int N, typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
|
||||
}
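A minimal sketch of the overload above (hypothetical kernel and functor names): a block-wide argmin that keeps the smallest key and carries its index along as the value:

    // Illustrative only: LessOp and blockArgMin are hypothetical names.
    struct LessOp
    {
        __device__ __forceinline__ bool operator ()(float a, float b) const { return a < b; }
    };

    __global__ void blockArgMin(const float* src, float* dstVal, int* dstIdx)
    {
        __shared__ float skeys[256];
        __shared__ int   sidx [256];

        const unsigned int tid = threadIdx.x;
        float key = src[blockIdx.x * 256 + tid];
        int   idx = tid;

        cv::gpu::device::reduceKeyVal<256>(skeys, key, sidx, idx, tid, LessOp());

        if (tid == 0)
        {
            dstVal[blockIdx.x] = key;    // smallest value in the block
            dstIdx[blockIdx.x] = idx;    // its position within the block
        }
    }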
|
||||
template <unsigned int N,
|
||||
typename K,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid, const Cmp& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
|
||||
const Cmp&>(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
template <unsigned int N,
|
||||
typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
|
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
|
||||
const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
|
||||
>(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
|
||||
// smem_tuple
|
||||
|
||||
template <typename T0>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*>
|
||||
smem_tuple(T0* t0)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*>
|
||||
smem_tuple(T0* t0, T1* t1)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_REDUCE_HPP__
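A minimal usage sketch (not part of the header): how a kernel might combine reduce() with smem_tuple() to fold two per-thread values in one call. The block size, kernel name and include paths are assumptions for illustration only; plus<> is taken from the device functional header.

#include <thrust/tuple.h>
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"   // assumed source of plus<>

namespace device = cv::gpu::device;

// Hypothetical kernel, launched with 256 threads per block: computes the
// per-block sums of two float arrays in a single reduce() call.
__global__ void blockSums(const float* a, const float* b, float* sumA, float* sumB, int n)
{
    __shared__ float sa[256];
    __shared__ float sb[256];

    const unsigned int tid = threadIdx.x;
    const unsigned int idx = blockIdx.x * blockDim.x + tid;

    float va = idx < n ? a[idx] : 0.0f;
    float vb = idx < n ? b[idx] : 0.0f;

    // smem_tuple packs the shared buffers, thrust::tie packs the registers,
    // and the last tuple supplies one functor per reduced value.
    device::reduce<256>(device::smem_tuple(sa, sb),
                        thrust::tie(va, vb),
                        tid,
                        thrust::make_tuple(device::plus<float>(), device::plus<float>()));

    if (tid == 0)
    {
        sumA[blockIdx.x] = va;
        sumB[blockIdx.x] = vb;
    }
}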
|
@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
|
||||
{
|
||||
return (uchar) ::max((int)v, 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
|
||||
{
|
||||
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
|
||||
{
|
||||
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
|
||||
{
|
||||
return (uchar) ::min(v, (uint)UCHAR_MAX);
|
||||
uint res = 0;
|
||||
int vi = v;
|
||||
asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
|
||||
{
|
||||
return saturate_cast<uchar>((uint)v);
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<uchar>(iv);
|
||||
uint res = 0;
|
||||
asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<uchar>(iv);
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
uint res = 0;
|
||||
asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
|
||||
return res;
|
||||
#else
|
||||
return saturate_cast<uchar>((float)v);
|
||||
#endif
|
||||
@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
|
||||
{
|
||||
return (schar) ::min((int)v, SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
|
||||
{
|
||||
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
|
||||
{
|
||||
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
|
||||
uint res = 0;
|
||||
uint vi = v;
|
||||
asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
|
||||
{
|
||||
return saturate_cast<schar>((int)v);
|
||||
uint res = 0;
|
||||
asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
|
||||
{
|
||||
return (schar) ::min(v, (uint)SCHAR_MAX);
|
||||
uint res = 0;
|
||||
asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<schar>(iv);
|
||||
uint res = 0;
|
||||
asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<schar>(iv);
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
uint res = 0;
|
||||
asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
|
||||
return res;
|
||||
#else
|
||||
return saturate_cast<schar>((float)v);
|
||||
#endif
|
||||
@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
|
||||
{
|
||||
return (ushort) ::max((int)v, 0);
|
||||
ushort res = 0;
|
||||
int vi = v;
|
||||
asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
|
||||
{
|
||||
return (ushort) ::max((int)v, 0);
|
||||
ushort res = 0;
|
||||
asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
|
||||
{
|
||||
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
|
||||
ushort res = 0;
|
||||
asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
|
||||
{
|
||||
return (ushort) ::min(v, (uint)USHRT_MAX);
|
||||
ushort res = 0;
|
||||
asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<ushort>(iv);
|
||||
ushort res = 0;
|
||||
asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<ushort>(iv);
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
ushort res = 0;
|
||||
asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
|
||||
return res;
|
||||
#else
|
||||
return saturate_cast<ushort>((float)v);
|
||||
#endif
|
||||
@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
|
||||
{
|
||||
return (short) ::min((int)v, SHRT_MAX);
|
||||
short res = 0;
|
||||
asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
|
||||
{
|
||||
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
|
||||
short res = 0;
|
||||
asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
|
||||
{
|
||||
return (short) ::min(v, (uint)SHRT_MAX);
|
||||
short res = 0;
|
||||
asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<short>(iv);
|
||||
short res = 0;
|
||||
asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<short>(iv);
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
short res = 0;
|
||||
asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
|
||||
return res;
|
||||
#else
|
||||
return saturate_cast<short>((float)v);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
|
||||
{
|
||||
int res = 0;
|
||||
asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
|
||||
{
|
||||
return __float2int_rn(v);
|
||||
@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
|
||||
{
|
||||
uint res = 0;
|
||||
int vi = v;
|
||||
asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
|
||||
{
|
||||
return __float2uint_rn(v);
|
||||
|
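A short, hypothetical kernel showing where the cvt.rni.sat path above is exercised: converting a float buffer to 8-bit with saturation and round-to-nearest. The kernel name and include path are illustrative only.

#include "opencv2/gpu/device/saturate_cast.hpp"

// Assumed example kernel: clamp-and-round float samples into uchar.
__global__ void floatToU8(const float* src, uchar* dst, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        dst[idx] = cv::gpu::device::saturate_cast<uchar>(src[idx]);  // maps to cvt.rni.sat.u8.f32
}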
@ -45,7 +45,6 @@
|
||||
|
||||
#include "saturate_cast.hpp"
|
||||
#include "datamov_utils.hpp"
|
||||
#include "detail/reduction_detail.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reduction
|
||||
|
||||
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
|
||||
}
|
||||
|
||||
template <int n, typename T, typename V, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
|
||||
}
|
||||
|
||||
template <int n, typename T, typename V1, typename V2, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Solve linear system
|
||||
|
||||
|
@ -43,7 +43,7 @@
|
||||
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
|
||||
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
|
||||
|
||||
#include "utility.hpp"
|
||||
#include "reduce.hpp"
|
||||
#include "functional.hpp"
|
||||
#include "detail/vec_distance_detail.hpp"
|
||||
|
||||
@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
|
@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
|
||||
|
145
modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
Normal file
@ -0,0 +1,145 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
|
||||
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return __shfl(val, srcLane, width);
|
||||
#else
|
||||
return T();
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return (unsigned int) __shfl((int) val, srcLane, width);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
int lo = __double2loint(val);
|
||||
int hi = __double2hiint(val);
|
||||
|
||||
lo = __shfl(lo, srcLane, width);
|
||||
hi = __shfl(hi, srcLane, width);
|
||||
|
||||
return __hiloint2double(hi, lo);
|
||||
#else
|
||||
return 0.0;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return __shfl_down(val, delta, width);
|
||||
#else
|
||||
return T();
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return (unsigned int) __shfl_down((int) val, delta, width);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
int lo = __double2loint(val);
|
||||
int hi = __double2hiint(val);
|
||||
|
||||
lo = __shfl_down(lo, delta, width);
|
||||
hi = __shfl_down(hi, delta, width);
|
||||
|
||||
return __hiloint2double(hi, lo);
|
||||
#else
|
||||
return 0.0;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return __shfl_up(val, delta, width);
|
||||
#else
|
||||
return T();
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return (unsigned int) __shfl_up((int) val, delta, width);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
int lo = __double2loint(val);
|
||||
int hi = __double2hiint(val);
|
||||
|
||||
lo = __shfl_up(lo, delta, width);
|
||||
hi = __shfl_up(hi, delta, width);
|
||||
|
||||
return __hiloint2double(hi, lo);
|
||||
#else
|
||||
return 0.0;
|
||||
#endif
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
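A hedged sketch of how these wrappers are meant to be combined: a warp-wide sum built from shfl_down. The helper name is invented; the loop is only meaningful on devices of compute capability 3.0 and above, since on older devices the stubs above return 0 / T().

#include "opencv2/gpu/device/warp_shuffle.hpp"

// Hypothetical device helper: sums a value across the 32 lanes of a warp.
__device__ __forceinline__ float warpSum(float val)
{
    for (int delta = 16; delta > 0; delta /= 2)
        val += cv::gpu::device::shfl_down(val, delta);
    return val;   // lane 0 ends up holding the full warp sum
}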
|
26
modules/gpu/misc/carma.toolchain.cmake
Normal file
@ -0,0 +1,26 @@
|
||||
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)
set(CMAKE_SYSTEM_PROCESSOR arm)

set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc-4.5)
set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++-4.5)

# suppress compiler warning
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-psabi" )

# can be located elsewhere
set(__arm_linux_eabi_root /usr/arm-linux-gnueabi)

set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${__arm_linux_eabi_root})

if(EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
  set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${CUDA_TOOLKIT_ROOT_DIR})
endif()

set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)

set(CARMA 1)
add_definitions(-DCARMA)
|
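A cross-compile is then configured by pointing CMake at this toolchain file; the build-directory layout and the CUDA toolkit path below are placeholders, not values taken from the repository.

# run from an empty build directory on the host (paths are examples only)
cmake -DCMAKE_TOOLCHAIN_FILE=../opencv/modules/gpu/misc/carma.toolchain.cmake \
      -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
      -DWITH_CUDA=ON \
      ../opencv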
File diff suppressed because it is too large
@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
cv::gpu::GpuMat d_src(src);
|
||||
cv::gpu::GpuMat d_hist;
|
||||
cv::gpu::GpuMat d_buf;
|
||||
|
||||
cv::gpu::calcHist(d_src, d_hist, d_buf);
|
||||
cv::gpu::calcHist(d_src, d_hist);
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
cv::gpu::calcHist(d_src, d_hist, d_buf);
|
||||
cv::gpu::calcHist(d_src, d_hist);
|
||||
}
|
||||
|
||||
GPU_SANITY_CHECK(d_hist);
|
||||
@ -1512,13 +1511,13 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColor, Combine(
|
||||
CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
|
||||
CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
|
||||
CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
|
||||
CvtColorInfo(3, 3, cv::COLOR_RGB2Lab),
|
||||
CvtColorInfo(3, 3, cv::COLOR_LBGR2Lab),
|
||||
CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
|
||||
CvtColorInfo(3, 3, cv::COLOR_RGB2Luv),
|
||||
CvtColorInfo(3, 3, cv::COLOR_LBGR2Luv),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Lab2RGB),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Luv2BGR),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Lab2LBGR),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Luv2LRGB),
|
||||
CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
|
||||
CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
|
||||
CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
|
||||
@ -1706,10 +1705,30 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidGetLayer, Combine(GPU_TYPICAL_MAT_S
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct Vec3fComparator
|
||||
{
|
||||
bool operator()(const cv::Vec3f& a, const cv::Vec3f b) const
|
||||
{
|
||||
if(a[0] != b[0]) return a[0] < b[0];
|
||||
else if(a[1] != b[1]) return a[1] < b[1];
|
||||
else return a[2] < b[2];
|
||||
}
|
||||
};
|
||||
struct Vec2fComparator
|
||||
{
|
||||
bool operator()(const cv::Vec2f& a, const cv::Vec2f b) const
|
||||
{
|
||||
if(a[0] != b[0]) return a[0] < b[0];
|
||||
else return a[1] < b[1];
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// HoughLines
|
||||
|
||||
PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
|
||||
PERF_TEST_P(Sz, ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
declare.time(30.0);
|
||||
|
||||
@ -1744,7 +1763,11 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
|
||||
cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold);
|
||||
}
|
||||
|
||||
GPU_SANITY_CHECK(d_lines);
|
||||
cv::Mat h_lines(d_lines);
|
||||
cv::Vec2f* begin = (cv::Vec2f*)(h_lines.ptr<char>(0));
|
||||
cv::Vec2f* end = (cv::Vec2f*)(h_lines.ptr<char>(0) + (h_lines.cols) * 2 * sizeof(float));
|
||||
std::sort(begin, end, Vec2fComparator());
|
||||
SANITY_CHECK(h_lines);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1756,7 +1779,8 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
|
||||
cv::HoughLines(src, lines, rho, theta, threshold);
|
||||
}
|
||||
|
||||
CPU_SANITY_CHECK(lines);
|
||||
std::sort(lines.begin(), lines.end(), Vec2fComparator());
|
||||
SANITY_CHECK(lines);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1804,7 +1828,11 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES,
|
||||
cv::gpu::HoughCircles(d_src, d_circles, d_buf, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
|
||||
}
|
||||
|
||||
GPU_SANITY_CHECK(d_circles);
|
||||
cv::Mat h_circles(d_circles);
|
||||
cv::Vec3f* begin = (cv::Vec3f*)(h_circles.ptr<char>(0));
|
||||
cv::Vec3f* end = (cv::Vec3f*)(h_circles.ptr<char>(0) + (h_circles.cols) * 3 * sizeof(float));
|
||||
std::sort(begin, end, Vec3fComparator());
|
||||
SANITY_CHECK(h_circles);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1817,7 +1845,8 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES,
|
||||
cv::HoughCircles(src, circles, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
|
||||
}
|
||||
|
||||
CPU_SANITY_CHECK(circles);
|
||||
std::sort(circles.begin(), circles.end(), Vec3fComparator());
|
||||
SANITY_CHECK(circles);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
|
||||
SANITY_CHECK(found_locations);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// HaarClassifier
|
||||
|
||||
|
@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
|
||||
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
|
||||
{
|
||||
#ifndef HAVE_CUBLAS
|
||||
(void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream;
|
||||
(void)src1;
|
||||
(void)src2;
|
||||
(void)alpha;
|
||||
(void)src3;
|
||||
(void)beta;
|
||||
(void)dst;
|
||||
(void)flags;
|
||||
(void)stream;
|
||||
CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
|
||||
|
||||
#else
|
||||
|
||||
// CUBLAS works with column-major matrices
|
||||
|
||||
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
|
||||
@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
|
||||
|
||||
if (src1.depth() == CV_64F)
|
||||
{
|
||||
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
|
||||
}
|
||||
|
||||
cublasSafeCall( cublasDestroy_v2(handle) );
|
||||
|
||||
#endif
|
||||
}
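On the caller side the same deviceSupports(NATIVE_DOUBLE) test can guard a double-precision request before it reaches gemm(); a hedged sketch, with the helper name and include path being assumptions:

#include "opencv2/gpu/gpu.hpp"

// Hypothetical helper: D = A*B + C in CV_64FC1, guarded the same way the library guards itself.
void gemm64f(const cv::gpu::GpuMat& A, const cv::gpu::GpuMat& B,
             const cv::gpu::GpuMat& C, cv::gpu::GpuMat& D)
{
    if (!cv::gpu::deviceSupports(cv::gpu::NATIVE_DOUBLE))
        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");

    cv::gpu::gemm(A, B, 1.0, C, 1.0, D);
}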
|
||||
|
||||
@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
|
||||
}
|
||||
else // if (src.elemSize() == 8)
|
||||
{
|
||||
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
|
||||
|
||||
NppStStreamHandler h(stream);
|
||||
|
@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_knnmatch
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_radius_match
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
@ -198,11 +198,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
using namespace cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -234,10 +234,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
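Dropping the compute-capability argument is internal only; the public matcher interface is unchanged. A hedged usage sketch of the single-train-image path, assuming the templated BruteForceMatcher_GPU front end and CV_32F descriptors (function name is illustrative):

#include <vector>
#include "opencv2/gpu/gpu.hpp"

// Hypothetical: match two uploaded descriptor matrices with L2 distance.
void matchDescriptors(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs,
                      std::vector<cv::DMatch>& matches)
{
    cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
    matcher.match(queryDescs, trainDescs, matches);   // matchSingle + matchDownload under the hood
}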
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
|
||||
@ -268,14 +265,14 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
|
||||
const float* distance_ptr = distance.ptr<float>();
|
||||
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
int train_idx = *trainIdx_ptr;
|
||||
|
||||
if (_trainIdx == -1)
|
||||
if (train_idx == -1)
|
||||
continue;
|
||||
|
||||
float _distance = *distance_ptr;
|
||||
float distance_local = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, _trainIdx, 0, _distance);
|
||||
DMatch m(queryIdx, train_idx, 0, distance_local);
|
||||
|
||||
matches.push_back(m);
|
||||
}
|
||||
@ -340,11 +337,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
using namespace cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -376,10 +373,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
|
||||
@ -413,16 +407,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
|
||||
const float* distance_ptr = distance.ptr<float>();
|
||||
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int trainIdx = *trainIdx_ptr;
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
|
||||
if (trainIdx == -1)
|
||||
if (_trainIdx == -1)
|
||||
continue;
|
||||
|
||||
int imgIdx = *imgIdx_ptr;
|
||||
int _imgIdx = *imgIdx_ptr;
|
||||
|
||||
float distance = *distance_ptr;
|
||||
float _distance = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, trainIdx, imgIdx, distance);
|
||||
DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
|
||||
|
||||
matches.push_back(m);
|
||||
}
|
||||
@ -451,11 +445,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_knnmatch;
|
||||
using namespace cv::gpu::device::bf_knnmatch;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -502,10 +496,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
|
||||
@ -548,13 +539,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c
|
||||
|
||||
for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int trainIdx = *trainIdx_ptr;
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
|
||||
if (trainIdx != -1)
|
||||
if (_trainIdx != -1)
|
||||
{
|
||||
float distance = *distance_ptr;
|
||||
float _distance = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, trainIdx, 0, distance);
|
||||
DMatch m(queryIdx, _trainIdx, 0, _distance);
|
||||
|
||||
curMatches.push_back(m);
|
||||
}
|
||||
@ -580,11 +571,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_knnmatch;
|
||||
using namespace cv::gpu::device::bf_knnmatch;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -621,10 +612,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
|
||||
@ -667,15 +655,15 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx,
|
||||
|
||||
for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int trainIdx = *trainIdx_ptr;
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
|
||||
if (trainIdx != -1)
|
||||
if (_trainIdx != -1)
|
||||
{
|
||||
int imgIdx = *imgIdx_ptr;
|
||||
int _imgIdx = *imgIdx_ptr;
|
||||
|
||||
float distance = *distance_ptr;
|
||||
float _distance = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, trainIdx, imgIdx, distance);
|
||||
DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
|
||||
|
||||
curMatches.push_back(m);
|
||||
}
|
||||
@ -765,7 +753,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -786,12 +774,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
}
|
||||
};
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
|
||||
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
|
||||
|
||||
const int nQuery = query.rows;
|
||||
const int nTrain = train.rows;
|
||||
|
||||
@ -814,7 +796,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
|
||||
@ -852,25 +834,25 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
|
||||
const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
|
||||
const float* distance_ptr = distance.ptr<float>(queryIdx);
|
||||
|
||||
const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
|
||||
const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
|
||||
|
||||
if (nMatches == 0)
|
||||
if (nMatched == 0)
|
||||
{
|
||||
if (!compactResult)
|
||||
matches.push_back(vector<DMatch>());
|
||||
continue;
|
||||
}
|
||||
|
||||
matches.push_back(vector<DMatch>(nMatches));
|
||||
matches.push_back(vector<DMatch>(nMatched));
|
||||
vector<DMatch>& curMatches = matches.back();
|
||||
|
||||
for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
|
||||
for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int trainIdx = *trainIdx_ptr;
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
|
||||
float distance = *distance_ptr;
|
||||
float _distance = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, trainIdx, 0, distance);
|
||||
DMatch m(queryIdx, _trainIdx, 0, _distance);
|
||||
|
||||
curMatches[i] = m;
|
||||
}
|
||||
@ -897,7 +879,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -918,12 +900,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
}
|
||||
};
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
|
||||
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
|
||||
|
||||
const int nQuery = query.rows;
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
@ -949,7 +925,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
vector<PtrStepSzb> masks_(masks.begin(), masks.end());
|
||||
|
||||
func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
|
||||
trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
|
||||
trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
|
||||
@ -990,9 +966,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
|
||||
const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
|
||||
const float* distance_ptr = distance.ptr<float>(queryIdx);
|
||||
|
||||
const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
|
||||
const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
|
||||
|
||||
if (nMatches == 0)
|
||||
if (nMatched == 0)
|
||||
{
|
||||
if (!compactResult)
|
||||
matches.push_back(vector<DMatch>());
|
||||
@ -1001,9 +977,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
|
||||
|
||||
matches.push_back(vector<DMatch>());
|
||||
vector<DMatch>& curMatches = matches.back();
|
||||
curMatches.reserve(nMatches);
|
||||
curMatches.reserve(nMatched);
|
||||
|
||||
for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
|
||||
for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
int _imgIdx = *imgIdx_ptr;
|
||||
|
@ -622,7 +622,7 @@ private:
|
||||
}
|
||||
|
||||
// copy data structures on gpu
|
||||
stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) ));
|
||||
stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
|
||||
trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
|
||||
nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
|
||||
leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));
|
||||
|
@ -53,7 +53,7 @@ void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nog
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
#include <cvt_colot_internal.h>
|
||||
#include "cvt_color_internal.h"
|
||||
|
||||
namespace cv { namespace gpu {
|
||||
namespace device
|
||||
@ -69,7 +69,7 @@ using namespace ::cv::gpu::device;
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef void (*gpu_func_t)(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
|
||||
typedef void (*gpu_func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
|
||||
{
|
||||
@ -1155,154 +1155,420 @@ namespace
|
||||
funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
|
||||
void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
|
||||
{
|
||||
#if (CUDA_VERSION < 5000)
|
||||
(void)src;
|
||||
(void)dst;
|
||||
(void)dcn;
|
||||
(void)st;
|
||||
CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
|
||||
#else
|
||||
CV_Assert(src.depth() == CV_8U);
|
||||
CV_Assert(src.channels() == 3);
|
||||
using namespace cv::gpu::device;
|
||||
static const gpu_func_t funcs[2][2][2] =
|
||||
{
|
||||
{
|
||||
{bgr_to_lab_8u, bgr_to_lab_32f},
|
||||
{bgra_to_lab_8u, bgra_to_lab_32f}
|
||||
},
|
||||
{
|
||||
{bgr_to_lab4_8u, bgr_to_lab4_32f},
|
||||
{bgra_to_lab4_8u, bgra_to_lab4_32f}
|
||||
}
|
||||
};
|
||||
|
||||
dcn = src.channels();
|
||||
if (dcn <= 0) dcn = 3;
|
||||
|
||||
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
|
||||
CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
|
||||
CV_Assert(src.channels() == 3 || src.channels() == 4);
|
||||
CV_Assert(dcn == 3 || dcn == 4);
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(st);
|
||||
NppStreamHandler h(stream);
|
||||
dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
|
||||
|
||||
NppiSize oSizeROI;
|
||||
oSizeROI.width = src.cols;
|
||||
oSizeROI.height = src.rows;
|
||||
|
||||
nppSafeCall( nppiBGRToLab_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
#endif
|
||||
funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void rgb_to_lab(const GpuMat& src, GpuMat& dst, int, Stream& stream)
|
||||
void rgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
|
||||
{
|
||||
bgr_to_rgb(src, dst, -1, stream);
|
||||
bgr_to_lab(dst, dst, -1, stream);
|
||||
using namespace cv::gpu::device;
|
||||
static const gpu_func_t funcs[2][2][2] =
|
||||
{
|
||||
{
|
||||
{rgb_to_lab_8u, rgb_to_lab_32f},
|
||||
{rgba_to_lab_8u, rgba_to_lab_32f}
|
||||
},
|
||||
{
|
||||
{rgb_to_lab4_8u, rgb_to_lab4_32f},
|
||||
{rgba_to_lab4_8u, rgba_to_lab4_32f}
|
||||
}
|
||||
};
|
||||
|
||||
if (dcn <= 0) dcn = 3;
|
||||
|
||||
CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
|
||||
CV_Assert(src.channels() == 3 || src.channels() == 4);
|
||||
CV_Assert(dcn == 3 || dcn == 4);
|
||||
|
||||
dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
|
||||
|
||||
funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    void lbgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
#if (CUDA_VERSION < 5000)
        (void)src;
        (void)dst;
        (void)dcn;
        (void)st;
        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
#else
        CV_Assert(src.depth() == CV_8U);
        CV_Assert(src.channels() == 3);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lbgr_to_lab_8u, lbgr_to_lab_32f},
                {lbgra_to_lab_8u, lbgra_to_lab_32f}
            },
            {
                {lbgr_to_lab4_8u, lbgr_to_lab4_32f},
                {lbgra_to_lab4_8u, lbgra_to_lab4_32f}
            }
        };

        dcn = src.channels();
        if (dcn <= 0) dcn = 3;

        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        cudaStream_t stream = StreamAccessor::getStream(st);
        NppStreamHandler h(stream);
        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        NppiSize oSizeROI;
        oSizeROI.width = src.cols;
        oSizeROI.height = src.rows;

        nppSafeCall( nppiLabToBGR_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
#endif
        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void lab_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    void lrgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        lab_to_bgr(src, dst, -1, stream);
        bgr_to_rgb(dst, dst, -1, stream);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lrgb_to_lab_8u, lrgb_to_lab_32f},
                {lrgba_to_lab_8u, lrgba_to_lab_32f}
            },
            {
                {lrgb_to_lab4_8u, lrgb_to_lab4_32f},
                {lrgba_to_lab4_8u, lrgba_to_lab4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
#if (CUDA_VERSION < 5000)
        (void)src;
        (void)dst;
        (void)dcn;
        (void)st;
        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
#else
        CV_Assert(src.depth() == CV_8U);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lab_to_bgr_8u, lab_to_bgr_32f},
                {lab4_to_bgr_8u, lab4_to_bgr_32f}
            },
            {
                {lab_to_bgra_8u, lab_to_bgra_32f},
                {lab4_to_bgra_8u, lab4_to_bgra_32f}
            }
        };

        dcn = src.channels();
        if (dcn <= 0) dcn = 3;

        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        cudaStream_t stream = StreamAccessor::getStream(st);
        NppStreamHandler h(stream);
        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        NppiSize oSizeROI;
        oSizeROI.width = src.cols;
        oSizeROI.height = src.rows;

        if (dcn == 3)
            nppSafeCall( nppiRGBToLUV_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
        else
            nppSafeCall( nppiRGBToLUV_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
#endif
        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void bgr_to_luv(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    void lab_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        bgr_to_rgb(src, dst, -1, stream);
        rgb_to_luv(dst, dst, -1, stream);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lab_to_rgb_8u, lab_to_rgb_32f},
                {lab4_to_rgb_8u, lab4_to_rgb_32f}
            },
            {
                {lab_to_rgba_8u, lab_to_rgba_32f},
                {lab4_to_rgba_8u, lab4_to_rgba_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    void lab_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
#if (CUDA_VERSION < 5000)
        (void)src;
        (void)dst;
        (void)dcn;
        (void)st;
        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
#else
        CV_Assert(src.depth() == CV_8U);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lab_to_lbgr_8u, lab_to_lbgr_32f},
                {lab4_to_lbgr_8u, lab4_to_lbgr_32f}
            },
            {
                {lab_to_lbgra_8u, lab_to_lbgra_32f},
                {lab4_to_lbgra_8u, lab4_to_lbgra_32f}
            }
        };

        dcn = src.channels();
        if (dcn <= 0) dcn = 3;

        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        cudaStream_t stream = StreamAccessor::getStream(st);
        NppStreamHandler h(stream);
        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        NppiSize oSizeROI;
        oSizeROI.width = src.cols;
        oSizeROI.height = src.rows;

        if (dcn == 3)
            nppSafeCall( nppiLUVToRGB_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
        else
            nppSafeCall( nppiLUVToRGB_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
#endif
        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    void lab_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        luv_to_rgb(src, dst, -1, stream);
        bgr_to_rgb(dst, dst, -1, stream);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lab_to_lrgb_8u, lab_to_lrgb_32f},
                {lab4_to_lrgb_8u, lab4_to_lrgb_32f}
            },
            {
                {lab_to_lrgba_8u, lab_to_lrgba_32f},
                {lab4_to_lrgba_8u, lab4_to_lrgba_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void bgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {bgr_to_luv_8u, bgr_to_luv_32f},
                {bgra_to_luv_8u, bgra_to_luv_32f}
            },
            {
                {bgr_to_luv4_8u, bgr_to_luv4_32f},
                {bgra_to_luv4_8u, bgra_to_luv4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {rgb_to_luv_8u, rgb_to_luv_32f},
                {rgba_to_luv_8u, rgba_to_luv_32f}
            },
            {
                {rgb_to_luv4_8u, rgb_to_luv4_32f},
                {rgba_to_luv4_8u, rgba_to_luv4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void lbgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lbgr_to_luv_8u, lbgr_to_luv_32f},
                {lbgra_to_luv_8u, lbgra_to_luv_32f}
            },
            {
                {lbgr_to_luv4_8u, lbgr_to_luv4_32f},
                {lbgra_to_luv4_8u, lbgra_to_luv4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void lrgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lrgb_to_luv_8u, lrgb_to_luv_32f},
                {lrgba_to_luv_8u, lrgba_to_luv_32f}
            },
            {
                {lrgb_to_luv4_8u, lrgb_to_luv4_32f},
                {lrgba_to_luv4_8u, lrgba_to_luv4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {luv_to_bgr_8u, luv_to_bgr_32f},
                {luv4_to_bgr_8u, luv4_to_bgr_32f}
            },
            {
                {luv_to_bgra_8u, luv_to_bgra_32f},
                {luv4_to_bgra_8u, luv4_to_bgra_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {luv_to_rgb_8u, luv_to_rgb_32f},
                {luv4_to_rgb_8u, luv4_to_rgb_32f}
            },
            {
                {luv_to_rgba_8u, luv_to_rgba_32f},
                {luv4_to_rgba_8u, luv4_to_rgba_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {luv_to_lbgr_8u, luv_to_lbgr_32f},
                {luv4_to_lbgr_8u, luv4_to_lbgr_32f}
            },
            {
                {luv_to_lbgra_8u, luv_to_lbgra_32f},
                {luv4_to_lbgra_8u, luv4_to_lbgra_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {luv_to_lrgb_8u, luv_to_lrgb_32f},
                {luv4_to_lrgb_8u, luv4_to_lrgb_32f}
            },
            {
                {luv_to_lrgba_8u, luv_to_lrgba_32f},
                {luv4_to_lrgba_8u, luv4_to_lrgba_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void rgba_to_mbgra(const GpuMat& src, GpuMat& dst, int, Stream& st)
@ -1475,15 +1741,15 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
        hls_to_bgr_full,        // CV_HLS2BGR_FULL = 72
        hls_to_rgb_full,        // CV_HLS2RGB_FULL = 73

        0,                      // CV_LBGR2Lab     = 74
        0,                      // CV_LRGB2Lab     = 75
        0,                      // CV_LBGR2Luv     = 76
        0,                      // CV_LRGB2Luv     = 77
        lbgr_to_lab,            // CV_LBGR2Lab     = 74
        lrgb_to_lab,            // CV_LRGB2Lab     = 75
        lbgr_to_luv,            // CV_LBGR2Luv     = 76
        lrgb_to_luv,            // CV_LRGB2Luv     = 77

        0,                      // CV_Lab2LBGR     = 78
        0,                      // CV_Lab2LRGB     = 79
        0,                      // CV_Luv2LBGR     = 80
        0,                      // CV_Luv2LRGB     = 81
        lab_to_lbgr,            // CV_Lab2LBGR     = 78
        lab_to_lrgb,            // CV_Lab2LRGB     = 79
        luv_to_lbgr,            // CV_Luv2LBGR     = 80
        luv_to_lrgb,            // CV_Luv2LRGB     = 81

        bgr_to_yuv,             // CV_BGR2YUV      = 82
        rgb_to_yuv,             // CV_RGB2YUV      = 83

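With the dispatch-table entries above filled in, the linearized-RGB Lab/Luv codes are routed through the same cvtColor path as every other conversion. A minimal standalone usage sketch (not part of the patch; it assumes the OpenCV 2.4 gpu module and a hypothetical input file name):

    // build against opencv_core, opencv_imgproc, opencv_highgui, opencv_gpu
    #include <opencv2/highgui/highgui.hpp>   // imread (2.4 layout)
    #include <opencv2/imgproc/imgproc.hpp>   // CV_LBGR2Lab constant
    #include <opencv2/gpu/gpu.hpp>           // cv::gpu::GpuMat, cv::gpu::cvtColor

    int main()
    {
        cv::Mat h_bgr = cv::imread("input.png");   // 8-bit BGR image (hypothetical file)
        cv::gpu::GpuMat d_bgr(h_bgr);              // upload to the device
        cv::gpu::GpuMat d_lab;
        cv::gpu::cvtColor(d_bgr, d_lab, CV_LBGR2Lab);  // code enabled by this patch
        cv::Mat h_lab;
        d_lab.download(h_lab);                     // bring the result back to the host
        return 0;
    }
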
@ -42,10 +42,13 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device
                                   int& bestTrainIdx1, int& bestTrainIdx2,
                                   float* s_distance, int* s_trainIdx)
    {
#if __CUDA_ARCH__ >= 300
        (void) s_distance;
        (void) s_trainIdx;

        float d1, d2;
        int i1, i2;

        #pragma unroll
        for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
        {
            d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
            d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
            i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
            i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);

            if (bestDistance1 < d1)
            {
                if (d1 < bestDistance2)
                {
                    bestDistance2 = d1;
                    bestTrainIdx2 = i1;
                }
            }
            else
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestDistance1 = d1;
                bestTrainIdx1 = i1;

                if (d2 < bestDistance2)
                {
                    bestDistance2 = d2;
                    bestTrainIdx2 = i2;
                }
            }
        }
#else
        float myBestDistance1 = numeric_limits<float>::max();
        float myBestDistance2 = numeric_limits<float>::max();
        int myBestTrainIdx1 = -1;
@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device

        bestTrainIdx1 = myBestTrainIdx1;
        bestTrainIdx2 = myBestTrainIdx2;
#endif
    }

    template <int BLOCK_SIZE>
@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device
                                   int& bestImgIdx1, int& bestImgIdx2,
                                   float* s_distance, int* s_trainIdx, int* s_imgIdx)
    {
#if __CUDA_ARCH__ >= 300
        (void) s_distance;
        (void) s_trainIdx;
        (void) s_imgIdx;

        float d1, d2;
        int i1, i2;
        int j1, j2;

        #pragma unroll
        for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
        {
            d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
            d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
            i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
            i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
            j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
            j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);

            if (bestDistance1 < d1)
            {
                if (d1 < bestDistance2)
                {
                    bestDistance2 = d1;
                    bestTrainIdx2 = i1;
                    bestImgIdx2 = j1;
                }
            }
            else
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;
                bestImgIdx2 = bestImgIdx1;

                bestDistance1 = d1;
                bestTrainIdx1 = i1;
                bestImgIdx1 = j1;

                if (d2 < bestDistance2)
                {
                    bestDistance2 = d2;
                    bestTrainIdx2 = i2;
                    bestImgIdx2 = j2;
                }
            }
        }
#else
        float myBestDistance1 = numeric_limits<float>::max();
        float myBestDistance2 = numeric_limits<float>::max();
        int myBestTrainIdx1 = -1;
@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device

        bestImgIdx1 = myBestImgIdx1;
        bestImgIdx2 = myBestImgIdx2;
#endif
    }

    ///////////////////////////////////////////////////////////////////////////////
@ -748,9 +839,8 @@ namespace cv { namespace gpu { namespace device
    template <typename Dist, typename T, typename Mask>
    void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                          const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
                          int cc, cudaStream_t stream)
                          cudaStream_t stream)
    {
        (void)cc;
        if (query.cols <= 64)
        {
            matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@ -780,9 +870,8 @@ namespace cv { namespace gpu { namespace device
    template <typename Dist, typename T, typename Mask>
    void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                          const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                          int cc, cudaStream_t stream)
                          cudaStream_t stream)
    {
        (void)cc;
        if (query.cols <= 64)
        {
            matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@ -945,9 +1034,8 @@ namespace cv { namespace gpu { namespace device
    template <typename Dist, typename T, typename Mask>
    void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                const PtrStepSzf& allDist,
                                int cc, cudaStream_t stream)
                                cudaStream_t stream)
    {
        (void)cc;
        if (query.cols <= 64)
        {
            calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
@ -1005,7 +1093,7 @@ namespace cv { namespace gpu { namespace device
        s_trainIdx[threadIdx.x] = bestIdx;
        __syncthreads();

        reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
        reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());

        if (threadIdx.x == 0)
        {
@ -1034,7 +1122,7 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
    void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
    {
        findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
    }
@ -1045,16 +1133,16 @@ namespace cv { namespace gpu { namespace device
    template <typename Dist, typename T, typename Mask>
    void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
                         const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                         int cc, cudaStream_t stream)
                         cudaStream_t stream)
    {
        if (k == 2)
        {
            match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
            match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
        }
        else
        {
            calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
            findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
            calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
            findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
        }
    }

@ -1063,103 +1151,103 @@ namespace cv { namespace gpu { namespace device

    template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                           const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                           int cc, cudaStream_t stream)
                                           cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
        else
            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
    }

    template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

    template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                           const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                           int cc, cudaStream_t stream)
                                           cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
        else
            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
    }

    //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

    template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                                const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                                int cc, cudaStream_t stream)
                                                cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
        else
            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
    }

    template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

    template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                            int cc, cudaStream_t stream)
                                            cudaStream_t stream)
    {
        if (masks.data)
            match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
        else
            match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
    }

    template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

    template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                            int cc, cudaStream_t stream)
                                            cudaStream_t stream)
    {
        if (masks.data)
            match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
        else
            match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
    }

    //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

    template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                 const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                                 int cc, cudaStream_t stream)
                                                 cudaStream_t stream)
    {
        if (masks.data)
            match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
        else
            match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
    }

    template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    } // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device {

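The __CUDA_ARCH__ >= 300 paths above replace the shared-memory reductions with warp-shuffle reductions. The following is a minimal standalone sketch of that pattern (not part of the patch; it assumes compute capability >= 3.0 and the pre-CUDA-9 __shfl_down intrinsic, and the kernel name is hypothetical):

    #include <cuda_runtime.h>
    #include <math.h>

    // One warp (32 threads) cooperatively finds the minimum of 32 floats
    // without touching shared memory, the same trick used by findBestMatch.
    __global__ void warpMinSketch(const float* data, float* result)
    {
        float v = data[threadIdx.x];                 // one value per lane
        for (int offset = 16; offset >= 1; offset /= 2)
        {
            float other = __shfl_down(v, offset);    // read v from lane (threadIdx.x + offset)
            v = fminf(v, other);                     // keep the smaller distance
        }
        if (threadIdx.x == 0)
            *result = v;                             // lane 0 ends up with the warp minimum
    }

    // Launched as warpMinSketch<<<1, 32>>>(d_data, d_result); and compiled with -arch=sm_30.
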
@ -42,7 +42,9 @@
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/utility.hpp"
|
||||
#include "opencv2/gpu/device/reduce.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/vec_distance.hpp"
|
||||
#include "opencv2/gpu/device/datamov_utils.hpp"
|
||||
@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device
|
||||
s_distance += threadIdx.y * BLOCK_SIZE;
|
||||
s_trainIdx += threadIdx.y * BLOCK_SIZE;
|
||||
|
||||
s_distance[threadIdx.x] = bestDistance;
|
||||
s_trainIdx[threadIdx.x] = bestTrainIdx;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
|
||||
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
|
||||
}
|
||||
|
||||
template <int BLOCK_SIZE>
|
||||
@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device
|
||||
s_trainIdx += threadIdx.y * BLOCK_SIZE;
|
||||
s_imgIdx += threadIdx.y * BLOCK_SIZE;
|
||||
|
||||
s_distance[threadIdx.x] = bestDistance;
|
||||
s_trainIdx[threadIdx.x] = bestTrainIdx;
|
||||
s_imgIdx [threadIdx.x] = bestImgIdx;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
|
||||
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@ -567,9 +558,8 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
(void)cc;
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
|
||||
@ -599,9 +589,8 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
(void)cc;
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
|
||||
@ -633,151 +622,151 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}

template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
} // namespace bf_match
}}} // namespace cv { namespace gpu { namespace device {

@ -42,7 +42,8 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

extern __shared__ int smem[];

const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}

#endif
}

template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

extern __shared__ int smem[];

const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}

#endif
}

template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
@ -281,9 +274,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
@ -313,9 +305,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
@ -347,124 +338,124 @@ namespace cv { namespace gpu { namespace device

template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device

@ -42,9 +42,10 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"

namespace cv { namespace gpu { namespace device
{
@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
};

void call(const PtrStepSz<float3> src, const float* rot,
@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
};

void call(const PtrStepSz<float3> src, const float* rot,
@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
return x * x;
}

template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers)
@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
++num_inliers;
}

extern __shared__ float s_num_inliers[];
s_num_inliers[threadIdx.x] = num_inliers;
__syncthreads();

for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
__shared__ int s_num_inliers[BLOCK_SIZE];
reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());

if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0];
g_num_inliers[blockIdx.x] = num_inliers;
}

void computeHypothesisScores(
@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device

dim3 threads(256);
dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);

computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() );

@ -43,459 +43,451 @@
#if !defined CUDA_DISABLER

#include <utility>
#include <algorithm>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace canny
{
struct L1 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::abs(x) + ::abs(y);
}

__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::sqrtf(x * x + y * y);
}

__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
};
}

namespace cv { namespace gpu { namespace device
{
namespace canny
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
enum { smart_shift = 4 };
};
template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
enum { smart_shift = 4 };
};
}}}

namespace canny
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
struct SrcTex
|
||||
{
|
||||
const int xoff;
|
||||
const int yoff;
|
||||
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
|
||||
|
||||
__device__ __forceinline__ int operator ()(int y, int x) const
|
||||
{
|
||||
__shared__ int smem[16][18];
|
||||
return tex2D(tex_src, x + xoff, y + yoff);
|
||||
}
|
||||
};
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
template <class Norm> __global__
|
||||
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
|
||||
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
|
||||
}
|
||||
__syncthreads();
|
||||
if (y >= mag.rows || x >= mag.cols)
|
||||
return;
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
}
|
||||
}
|
||||
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
|
||||
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
|
||||
|
||||
dx(y, x) = dxVal;
|
||||
dy(y, x) = dyVal;
|
||||
|
||||
mag(y, x) = norm(dxVal, dyVal);
|
||||
}
|
||||
|
||||
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
|
||||
{
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
|
||||
|
||||
bindTexture(&tex_src, srcWhole);
|
||||
SrcTex src(xoff, yoff);
|
||||
|
||||
if (L2Grad)
|
||||
{
|
||||
L2 norm;
|
||||
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
|
||||
}
|
||||
else
|
||||
{
|
||||
L1 norm;
|
||||
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
|
||||
}
|
||||
|
||||
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
|
||||
{
|
||||
if (L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
L2 norm;
|
||||
transform(dx, dy, mag, norm, WithOutMask(), 0);
|
||||
}
|
||||
|
||||
struct L1
|
||||
else
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return ::abs(x) + ::abs(y);
|
||||
}
|
||||
};
|
||||
struct L2
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return ::sqrtf(x * x + y * y);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
|
||||
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
__shared__ int sdx[18][16];
|
||||
__shared__ int sdy[18][16];
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
|
||||
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
|
||||
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
|
||||
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
|
||||
|
||||
dx.ptr(i)[j] = x;
|
||||
dy.ptr(i)[j] = y;
|
||||
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
|
||||
}
|
||||
}
|
||||
L1 norm;
|
||||
transform(dx, dy, mag, norm, WithOutMask(), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace canny
|
||||
{
|
||||
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
|
||||
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
|
||||
{
|
||||
const int CANNY_SHIFT = 15;
|
||||
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
|
||||
return;
|
||||
|
||||
int dxVal = dx(y, x);
|
||||
int dyVal = dy(y, x);
|
||||
|
||||
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
|
||||
const float m = tex2D(tex_mag, x, y);
|
||||
|
||||
dxVal = ::abs(dxVal);
|
||||
dyVal = ::abs(dyVal);
|
||||
|
||||
// 0 - the pixel can not belong to an edge
|
||||
// 1 - the pixel might belong to an edge
|
||||
// 2 - the pixel does belong to an edge
|
||||
int edge_type = 0;
|
||||
|
||||
if (m > low_thresh)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
const int tg22x = dxVal * TG22;
|
||||
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
dyVal <<= CANNY_SHIFT;
|
||||
|
||||
if (dyVal < tg22x)
|
||||
{
|
||||
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else if(dyVal > tg67x)
|
||||
{
|
||||
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
{
|
||||
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
map(y, x) = edge_type;
|
||||
}
|
||||
|
||||
if (i < rows && j < cols)
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
|
||||
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
|
||||
{
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
|
||||
|
||||
bindTexture(&tex_mag, mag);
|
||||
|
||||
calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace canny
|
||||
{
|
||||
__device__ int counter = 0;
|
||||
|
||||
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
|
||||
{
|
||||
__shared__ volatile int smem[18][18];
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
|
||||
if (threadIdx.y == 0)
|
||||
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
|
||||
if (threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
|
||||
if (threadIdx.x == 0)
|
||||
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1)
|
||||
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
|
||||
if (threadIdx.x == 0 && threadIdx.y == 0)
|
||||
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
|
||||
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
|
||||
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (x >= map.cols || y >= map.rows)
|
||||
return;
|
||||
|
||||
int n;
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 16; ++k)
|
||||
{
|
||||
n = 0;
|
||||
|
||||
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
map(y, x) = e;
|
||||
|
||||
n = 0;
|
||||
|
||||
if (e == 2)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CANNY_SHIFT 15
|
||||
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
|
||||
|
||||
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
|
||||
if (n > 0)
|
||||
{
|
||||
__shared__ float smem[18][18];
|
||||
const int ind = ::atomicAdd(&counter, 1);
|
||||
st[ind] = make_ushort2(x, y);
|
||||
}
|
||||
}
|
||||
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
|
||||
{
|
||||
void* counter_ptr;
|
||||
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
|
||||
|
||||
const int tid = threadIdx.y * 16 + threadIdx.x;
|
||||
const int lx = tid % 18;
|
||||
const int ly = tid / 18;
|
||||
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
|
||||
|
||||
if (ly < 14)
|
||||
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
|
||||
|
||||
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
|
||||
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
|
||||
edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace canny
|
||||
{
|
||||
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
|
||||
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
|
||||
|
||||
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
|
||||
{
|
||||
const int stack_size = 512;
|
||||
|
||||
__shared__ int s_counter;
|
||||
__shared__ int s_ind;
|
||||
__shared__ ushort2 s_st[stack_size];
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
s_counter = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int ind = blockIdx.y * gridDim.x + blockIdx.x;
|
||||
|
||||
if (ind >= count)
|
||||
return;
|
||||
|
||||
ushort2 pos = st1[ind];
|
||||
|
||||
if (threadIdx.x < 8)
|
||||
{
|
||||
pos.x += c_dx[threadIdx.x];
|
||||
pos.y += c_dy[threadIdx.x];
|
||||
|
||||
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
|
||||
{
|
||||
map(pos.y, pos.x) = 2;
|
||||
|
||||
ind = Emulation::smem::atomicAdd(&s_counter, 1);
|
||||
|
||||
s_st[ind] = pos;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
|
||||
{
|
||||
const int subTaskIdx = threadIdx.x >> 3;
|
||||
const int portion = ::min(s_counter, blockDim.x >> 3);
|
||||
|
||||
if (subTaskIdx < portion)
|
||||
pos = s_st[s_counter - 1 - subTaskIdx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
        if (i < rows && j < cols)
        {
            int x = dx.ptr(i)[j];
            int y = dy.ptr(i)[j];
            const int s = (x ^ y) < 0 ? -1 : 1;
            const float m = smem[threadIdx.y + 1][threadIdx.x + 1];

            x = ::abs(x);
            y = ::abs(y);

            // 0 - the pixel can not belong to an edge
            // 1 - the pixel might belong to an edge
            // 2 - the pixel does belong to an edge
            int edge_type = 0;

            if (m > low_thresh)
            {
                const int tg22x = x * TG22;
                const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);

                y <<= CANNY_SHIFT;

                if (y < tg22x)
                {
                    if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
                        edge_type = 1 + (int)(m > high_thresh);
                }
                else if( y > tg67x )
                {
                    if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
                        edge_type = 1 + (int)(m > high_thresh);
                }
                else
                {
                    if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
                        edge_type = 1 + (int)(m > high_thresh);
                }
            }

            map.ptr(i + 1)[j + 1] = edge_type;
        }
    }

    #undef CANNY_SHIFT
    #undef TG22

    void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }

    //////////////////////////////////////////////////////////////////////////////////////////

    __device__ unsigned int counter = 0;

    __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
    {
        #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)

        __shared__ int smem[18][18];

        const int j = blockIdx.x * 16 + threadIdx.x;
        const int i = blockIdx.y * 16 + threadIdx.y;

        const int tid = threadIdx.y * 16 + threadIdx.x;
        const int lx = tid % 18;
        const int ly = tid / 18;

        if (ly < 14)
            smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
            smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];

        __syncthreads();

        if (i < rows && j < cols)
        {
            int n;

            #pragma unroll
            for (int k = 0; k < 16; ++k)
            {
                n = 0;

                if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
                {
                    n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                    n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                    n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                    n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
                }

                if (n > 0)
                    smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
            }

            const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

            map.ptr(i + 1)[j + 1] = e;

            n = 0;

            if (e == 2)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
            }

            if (n > 0)
            {
                const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
                st[ind] = make_ushort2(j + 1, i + 1);
            }
        }

        #endif
    }

    void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );

        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }

__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};

__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120

const int stack_size = 512;

__shared__ unsigned int s_counter;
__shared__ unsigned int s_ind;
__shared__ ushort2 s_st[stack_size];

if (threadIdx.x == 0)
s_counter = 0;
s_counter -= portion;

__syncthreads();

int ind = blockIdx.y * gridDim.x + blockIdx.x;

if (ind < count)
if (subTaskIdx < portion)
{
ushort2 pos = st1[ind];
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];

if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
map(pos.y, pos.x) = 2;

if (map.ptr(pos.y)[pos.x] == 1)
{
map.ptr(pos.y)[pos.x] = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);

ind = atomicInc(&s_counter, (unsigned int)(-1));

s_st[ind] = pos;
}
}
__syncthreads();

while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);

pos.x = pos.y = 0;

if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();

if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();

if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];

if (map.ptr(pos.y)[pos.x] == 1)
{
map.ptr(pos.y)[pos.x] = 2;

ind = atomicInc(&s_counter, (unsigned int)(-1));

s_st[ind] = pos;
}
}
__syncthreads();
}

if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();

ind = s_ind;

for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
{
st2[ind + i] = s_st[i];
}
}
s_st[ind] = pos;
}
}

#endif
__syncthreads();
}

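// Note (editorial, not a line from this diff): the global hysteresis kernel above behaves as a
// breadth-first search. Each block starts from one queued weak-edge pixel, pushes its unvisited
// weak neighbours onto a small shared-memory stack, and repeatedly pops batches of stack entries
// (8 threads per popped pixel, one per neighbour) until the stack is empty or nearly full; any
// leftover candidates are flushed to a global queue (st2) for the next kernel launch.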
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
if (s_counter > 0)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

while (count > 0)
if (threadIdx.x == 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );

dim3 block(128, 1, 1);
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );

cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

std::swap(st1, st2);
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}

__syncthreads();

ind = s_ind;

for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
st2[ind + i] = s_st[i];
}
}

__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

while (count > 0)
{
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

if (i < rows && j < cols)
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
}
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

getEdges<<<grid, block>>>(map, dst, rows, cols);
edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace canny
}}} // namespace cv { namespace gpu { namespace device

cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

std::swap(st1, st2);
}
}
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }

        __device__ __forceinline__ GetEdges() {}
        __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst)
    {
        transform(map, dst, GetEdges(), WithOutMask(), 0);
    }
}

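// Note (editorial, not a line from this diff): GetEdges maps the final edge map to the 8-bit
// output in a single pass: e == 2 (confirmed edge) gives -(2 >> 1) = -1, i.e. 0xFF = 255,
// while e == 0 or e == 1 gives 0, so only confirmed edges survive in the result image.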
#endif /* CUDA_DISABLER */
@ -497,6 +497,7 @@ namespace cv { namespace gpu { namespace device

void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
{
    (void) flags;
    dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
    dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));


@ -42,10 +42,10 @@

#if !defined CUDA_DISABLER

#include <internal_shared.hpp>
#include <opencv2/gpu/device/transform.hpp>
#include <opencv2/gpu/device/color.hpp>
#include <cvt_colot_internal.h>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/color.hpp"
#include "cvt_color_internal.h"

namespace cv { namespace gpu { namespace device
{
@ -224,7 +224,7 @@ namespace cv { namespace gpu { namespace device
    };

    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
        void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream) \
        void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
        { \
            traits::functor_type functor = traits::create_functor(); \
            typedef typename traits::functor_type::argument_type src_t; \
@ -241,6 +241,10 @@ namespace cv { namespace gpu { namespace device
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)

    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)

    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
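// Note (editorial, not a line from this diff): with the signature change above, an instantiation
// such as OPENCV_GPU_IMPLEMENT_CVTCOLOR(bgr_to_rgb_8u, bgr_to_rgb_traits<uchar>) now expands to
//     void bgr_to_rgb_8u(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
// taking its buffers by value instead of the previous const PtrStepSzb& parameters;
// bgr_to_rgb_8u / bgr_to_rgb_traits are used here only as an illustrative name pair.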
@ -339,46 +343,119 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)

    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace gpu { namespace device

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/column_filter.0.cu  Normal file
@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "column_filter.h"

namespace filter
{
    template void linearColumn<float, uchar>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
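// Note (editorial, not a line from this diff): column_filter.0.cu .. column_filter.14.cu each
// hold a single explicit instantiation of filter::linearColumn for one (kernel, destination)
// type pair, while the shared implementation presumably lives in column_filter.h, which every
// file includes. Splitting the instantiations into separate translation units keeps each .cu
// file small, presumably to reduce nvcc compile time and memory per file and to let the
// instantiations compile in parallel.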
53
modules/gpu/src/cuda/column_filter.1.cu
Normal file
53
modules/gpu/src/cuda/column_filter.1.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.10.cu
Normal file
53
modules/gpu/src/cuda/column_filter.10.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.11.cu
Normal file
53
modules/gpu/src/cuda/column_filter.11.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.12.cu
Normal file
53
modules/gpu/src/cuda/column_filter.12.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.13.cu
Normal file
53
modules/gpu/src/cuda/column_filter.13.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, int3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.14.cu
Normal file
53
modules/gpu/src/cuda/column_filter.14.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float4, int4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.2.cu
Normal file
53
modules/gpu/src/cuda/column_filter.2.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.3.cu
Normal file
53
modules/gpu/src/cuda/column_filter.3.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.4.cu
Normal file
53
modules/gpu/src/cuda/column_filter.4.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.5.cu
Normal file
53
modules/gpu/src/cuda/column_filter.5.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.6.cu
Normal file
53
modules/gpu/src/cuda/column_filter.6.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.7.cu
Normal file
53
modules/gpu/src/cuda/column_filter.7.cu
Normal file
@ -0,0 +1,53 @@
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#if !defined CUDA_DISABLER

#include "column_filter.h"

namespace filter
{
    template void linearColumn<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
modules/gpu/src/cuda/column_filter.8.cu (new file, 53 lines)
@ -0,0 +1,53 @@
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#if !defined CUDA_DISABLER

#include "column_filter.h"

namespace filter
{
    template void linearColumn<float, short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
modules/gpu/src/cuda/column_filter.9.cu (new file, 53 lines)
@ -0,0 +1,53 @@
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#if !defined CUDA_DISABLER

#include "column_filter.h"

namespace filter
{
    template void linearColumn<float4, short4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
@ -1,391 +0,0 @@ (entire file deleted; its former contents follow)
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
#include "opencv2/gpu/device/static_check.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace column_filter
|
||||
{
|
||||
#define MAX_KERNEL_SIZE 32
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
|
||||
{
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, typename B>
|
||||
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
|
||||
{
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int BLOCK_DIM_X = 16;
|
||||
const int BLOCK_DIM_Y = 16;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
|
||||
#else
|
||||
const int BLOCK_DIM_X = 16;
|
||||
const int BLOCK_DIM_Y = 8;
|
||||
const int PATCH_PER_BLOCK = 2;
|
||||
const int HALO_SIZE = 2;
|
||||
#endif
|
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
|
||||
|
||||
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
|
||||
|
||||
if (x >= src.cols)
|
||||
return;
|
||||
|
||||
const T* src_col = src.ptr() + x;
|
||||
|
||||
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
|
||||
|
||||
if (blockIdx.y > 0)
|
||||
{
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
|
||||
}
|
||||
else
|
||||
{
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
|
||||
}
|
||||
|
||||
if (blockIdx.y + 2 < gridDim.y)
|
||||
{
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
|
||||
}
|
||||
else
|
||||
{
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
{
|
||||
const int y = yStart + j * BLOCK_DIM_Y;
|
||||
|
||||
if (y < src.rows)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < KSIZE; ++k)
|
||||
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
|
||||
|
||||
dst(y, x) = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B>
|
||||
void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
|
||||
{
|
||||
int BLOCK_DIM_X;
|
||||
int BLOCK_DIM_Y;
|
||||
int PATCH_PER_BLOCK;
|
||||
|
||||
if (cc >= 20)
|
||||
{
|
||||
BLOCK_DIM_X = 16;
|
||||
BLOCK_DIM_Y = 16;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCK_DIM_X = 16;
|
||||
BLOCK_DIM_Y = 8;
|
||||
PATCH_PER_BLOCK = 2;
|
||||
}
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
|
||||
|
||||
B<T> brd(src.rows);
|
||||
|
||||
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T, typename D>
|
||||
void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5][33] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColWrap>
|
||||
}
|
||||
};
|
||||
|
||||
loadKernel(kernel, ksize, stream);
|
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
|
||||
}
|
||||
|
||||
template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , int >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
} // namespace column_filter
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
|
||||
#endif /* CUDA_DISABLER */
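The deleted monolithic file above and the new column_filter.h implement the same kernel: the filter taps live in __constant__ memory (loadKernel copies them with cudaMemcpyToSymbol or cudaMemcpyToSymbolAsync), each block stages a (PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y tall column tile in shared memory with upper and lower halos, and every thread then accumulates KSIZE taps for up to PATCH_PER_BLOCK output rows. The sketch below shows only the core idea, constant-memory coefficients, one output pixel per thread and replicate-border handling, without the shared-memory tiling; naiveColumnFilter and d_coeffs are illustrative names, not part of the sources.

// Illustrative sketch, not the OpenCV kernel: a naive column convolution
// with the coefficients kept in constant memory.
#define MAX_TAPS 32
__constant__ float d_coeffs[MAX_TAPS];

__global__ void naiveColumnFilter(const float* src, float* dst,
                                  int rows, int cols, int stride,
                                  int ksize, int anchor)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= cols || y >= rows)
        return;

    float sum = 0.0f;
    for (int k = 0; k < ksize; ++k)
    {
        int yy = y - anchor + k;
        yy = ::max(0, ::min(yy, rows - 1));   // BORDER_REPLICATE along the column
        sum += src[yy * stride + x] * d_coeffs[k];
    }
    dst[y * stride + x] = sum;
}

Before the launch the host uploads the coefficients once, e.g. cudaMemcpyToSymbol(d_coeffs, hostCoeffs, ksize * sizeof(float)), which is exactly what loadKernel() does for c_kernel above.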
modules/gpu/src/cuda/column_filter.h (new file, 373 lines)
@ -0,0 +1,373 @@
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace column_filter
|
||||
{
|
||||
#define MAX_KERNEL_SIZE 32
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
template <int KSIZE, typename T, typename D, typename B>
|
||||
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
|
||||
{
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int BLOCK_DIM_X = 16;
|
||||
const int BLOCK_DIM_Y = 16;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
|
||||
#else
|
||||
const int BLOCK_DIM_X = 16;
|
||||
const int BLOCK_DIM_Y = 8;
|
||||
const int PATCH_PER_BLOCK = 2;
|
||||
const int HALO_SIZE = 2;
|
||||
#endif
|
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
|
||||
|
||||
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
|
||||
|
||||
if (x >= src.cols)
|
||||
return;
|
||||
|
||||
const T* src_col = src.ptr() + x;
|
||||
|
||||
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
|
||||
|
||||
if (blockIdx.y > 0)
|
||||
{
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
|
||||
}
|
||||
else
|
||||
{
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
|
||||
}
|
||||
|
||||
if (blockIdx.y + 2 < gridDim.y)
|
||||
{
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
|
||||
}
|
||||
else
|
||||
{
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
{
|
||||
const int y = yStart + j * BLOCK_DIM_Y;
|
||||
|
||||
if (y < src.rows)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < KSIZE; ++k)
|
||||
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
|
||||
|
||||
dst(y, x) = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B>
|
||||
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
|
||||
{
|
||||
int BLOCK_DIM_X;
|
||||
int BLOCK_DIM_Y;
|
||||
int PATCH_PER_BLOCK;
|
||||
|
||||
if (cc >= 20)
|
||||
{
|
||||
BLOCK_DIM_X = 16;
|
||||
BLOCK_DIM_Y = 16;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCK_DIM_X = 16;
|
||||
BLOCK_DIM_Y = 8;
|
||||
PATCH_PER_BLOCK = 2;
|
||||
}
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
|
||||
|
||||
B<T> brd(src.rows);
|
||||
|
||||
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5][33] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 2, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 3, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 4, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 5, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 6, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 7, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 8, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 9, T, D, BrdColReflect101>,
|
||||
column_filter::caller<10, T, D, BrdColReflect101>,
|
||||
column_filter::caller<11, T, D, BrdColReflect101>,
|
||||
column_filter::caller<12, T, D, BrdColReflect101>,
|
||||
column_filter::caller<13, T, D, BrdColReflect101>,
|
||||
column_filter::caller<14, T, D, BrdColReflect101>,
|
||||
column_filter::caller<15, T, D, BrdColReflect101>,
|
||||
column_filter::caller<16, T, D, BrdColReflect101>,
|
||||
column_filter::caller<17, T, D, BrdColReflect101>,
|
||||
column_filter::caller<18, T, D, BrdColReflect101>,
|
||||
column_filter::caller<19, T, D, BrdColReflect101>,
|
||||
column_filter::caller<20, T, D, BrdColReflect101>,
|
||||
column_filter::caller<21, T, D, BrdColReflect101>,
|
||||
column_filter::caller<22, T, D, BrdColReflect101>,
|
||||
column_filter::caller<23, T, D, BrdColReflect101>,
|
||||
column_filter::caller<24, T, D, BrdColReflect101>,
|
||||
column_filter::caller<25, T, D, BrdColReflect101>,
|
||||
column_filter::caller<26, T, D, BrdColReflect101>,
|
||||
column_filter::caller<27, T, D, BrdColReflect101>,
|
||||
column_filter::caller<28, T, D, BrdColReflect101>,
|
||||
column_filter::caller<29, T, D, BrdColReflect101>,
|
||||
column_filter::caller<30, T, D, BrdColReflect101>,
|
||||
column_filter::caller<31, T, D, BrdColReflect101>,
|
||||
column_filter::caller<32, T, D, BrdColReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 2, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 3, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 4, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 5, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 6, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 7, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 8, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 9, T, D, BrdColReplicate>,
|
||||
column_filter::caller<10, T, D, BrdColReplicate>,
|
||||
column_filter::caller<11, T, D, BrdColReplicate>,
|
||||
column_filter::caller<12, T, D, BrdColReplicate>,
|
||||
column_filter::caller<13, T, D, BrdColReplicate>,
|
||||
column_filter::caller<14, T, D, BrdColReplicate>,
|
||||
column_filter::caller<15, T, D, BrdColReplicate>,
|
||||
column_filter::caller<16, T, D, BrdColReplicate>,
|
||||
column_filter::caller<17, T, D, BrdColReplicate>,
|
||||
column_filter::caller<18, T, D, BrdColReplicate>,
|
||||
column_filter::caller<19, T, D, BrdColReplicate>,
|
||||
column_filter::caller<20, T, D, BrdColReplicate>,
|
||||
column_filter::caller<21, T, D, BrdColReplicate>,
|
||||
column_filter::caller<22, T, D, BrdColReplicate>,
|
||||
column_filter::caller<23, T, D, BrdColReplicate>,
|
||||
column_filter::caller<24, T, D, BrdColReplicate>,
|
||||
column_filter::caller<25, T, D, BrdColReplicate>,
|
||||
column_filter::caller<26, T, D, BrdColReplicate>,
|
||||
column_filter::caller<27, T, D, BrdColReplicate>,
|
||||
column_filter::caller<28, T, D, BrdColReplicate>,
|
||||
column_filter::caller<29, T, D, BrdColReplicate>,
|
||||
column_filter::caller<30, T, D, BrdColReplicate>,
|
||||
column_filter::caller<31, T, D, BrdColReplicate>,
|
||||
column_filter::caller<32, T, D, BrdColReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColConstant>,
|
||||
column_filter::caller< 2, T, D, BrdColConstant>,
|
||||
column_filter::caller< 3, T, D, BrdColConstant>,
|
||||
column_filter::caller< 4, T, D, BrdColConstant>,
|
||||
column_filter::caller< 5, T, D, BrdColConstant>,
|
||||
column_filter::caller< 6, T, D, BrdColConstant>,
|
||||
column_filter::caller< 7, T, D, BrdColConstant>,
|
||||
column_filter::caller< 8, T, D, BrdColConstant>,
|
||||
column_filter::caller< 9, T, D, BrdColConstant>,
|
||||
column_filter::caller<10, T, D, BrdColConstant>,
|
||||
column_filter::caller<11, T, D, BrdColConstant>,
|
||||
column_filter::caller<12, T, D, BrdColConstant>,
|
||||
column_filter::caller<13, T, D, BrdColConstant>,
|
||||
column_filter::caller<14, T, D, BrdColConstant>,
|
||||
column_filter::caller<15, T, D, BrdColConstant>,
|
||||
column_filter::caller<16, T, D, BrdColConstant>,
|
||||
column_filter::caller<17, T, D, BrdColConstant>,
|
||||
column_filter::caller<18, T, D, BrdColConstant>,
|
||||
column_filter::caller<19, T, D, BrdColConstant>,
|
||||
column_filter::caller<20, T, D, BrdColConstant>,
|
||||
column_filter::caller<21, T, D, BrdColConstant>,
|
||||
column_filter::caller<22, T, D, BrdColConstant>,
|
||||
column_filter::caller<23, T, D, BrdColConstant>,
|
||||
column_filter::caller<24, T, D, BrdColConstant>,
|
||||
column_filter::caller<25, T, D, BrdColConstant>,
|
||||
column_filter::caller<26, T, D, BrdColConstant>,
|
||||
column_filter::caller<27, T, D, BrdColConstant>,
|
||||
column_filter::caller<28, T, D, BrdColConstant>,
|
||||
column_filter::caller<29, T, D, BrdColConstant>,
|
||||
column_filter::caller<30, T, D, BrdColConstant>,
|
||||
column_filter::caller<31, T, D, BrdColConstant>,
|
||||
column_filter::caller<32, T, D, BrdColConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColReflect>,
|
||||
column_filter::caller< 2, T, D, BrdColReflect>,
|
||||
column_filter::caller< 3, T, D, BrdColReflect>,
|
||||
column_filter::caller< 4, T, D, BrdColReflect>,
|
||||
column_filter::caller< 5, T, D, BrdColReflect>,
|
||||
column_filter::caller< 6, T, D, BrdColReflect>,
|
||||
column_filter::caller< 7, T, D, BrdColReflect>,
|
||||
column_filter::caller< 8, T, D, BrdColReflect>,
|
||||
column_filter::caller< 9, T, D, BrdColReflect>,
|
||||
column_filter::caller<10, T, D, BrdColReflect>,
|
||||
column_filter::caller<11, T, D, BrdColReflect>,
|
||||
column_filter::caller<12, T, D, BrdColReflect>,
|
||||
column_filter::caller<13, T, D, BrdColReflect>,
|
||||
column_filter::caller<14, T, D, BrdColReflect>,
|
||||
column_filter::caller<15, T, D, BrdColReflect>,
|
||||
column_filter::caller<16, T, D, BrdColReflect>,
|
||||
column_filter::caller<17, T, D, BrdColReflect>,
|
||||
column_filter::caller<18, T, D, BrdColReflect>,
|
||||
column_filter::caller<19, T, D, BrdColReflect>,
|
||||
column_filter::caller<20, T, D, BrdColReflect>,
|
||||
column_filter::caller<21, T, D, BrdColReflect>,
|
||||
column_filter::caller<22, T, D, BrdColReflect>,
|
||||
column_filter::caller<23, T, D, BrdColReflect>,
|
||||
column_filter::caller<24, T, D, BrdColReflect>,
|
||||
column_filter::caller<25, T, D, BrdColReflect>,
|
||||
column_filter::caller<26, T, D, BrdColReflect>,
|
||||
column_filter::caller<27, T, D, BrdColReflect>,
|
||||
column_filter::caller<28, T, D, BrdColReflect>,
|
||||
column_filter::caller<29, T, D, BrdColReflect>,
|
||||
column_filter::caller<30, T, D, BrdColReflect>,
|
||||
column_filter::caller<31, T, D, BrdColReflect>,
|
||||
column_filter::caller<32, T, D, BrdColReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColWrap>,
|
||||
column_filter::caller< 2, T, D, BrdColWrap>,
|
||||
column_filter::caller< 3, T, D, BrdColWrap>,
|
||||
column_filter::caller< 4, T, D, BrdColWrap>,
|
||||
column_filter::caller< 5, T, D, BrdColWrap>,
|
||||
column_filter::caller< 6, T, D, BrdColWrap>,
|
||||
column_filter::caller< 7, T, D, BrdColWrap>,
|
||||
column_filter::caller< 8, T, D, BrdColWrap>,
|
||||
column_filter::caller< 9, T, D, BrdColWrap>,
|
||||
column_filter::caller<10, T, D, BrdColWrap>,
|
||||
column_filter::caller<11, T, D, BrdColWrap>,
|
||||
column_filter::caller<12, T, D, BrdColWrap>,
|
||||
column_filter::caller<13, T, D, BrdColWrap>,
|
||||
column_filter::caller<14, T, D, BrdColWrap>,
|
||||
column_filter::caller<15, T, D, BrdColWrap>,
|
||||
column_filter::caller<16, T, D, BrdColWrap>,
|
||||
column_filter::caller<17, T, D, BrdColWrap>,
|
||||
column_filter::caller<18, T, D, BrdColWrap>,
|
||||
column_filter::caller<19, T, D, BrdColWrap>,
|
||||
column_filter::caller<20, T, D, BrdColWrap>,
|
||||
column_filter::caller<21, T, D, BrdColWrap>,
|
||||
column_filter::caller<22, T, D, BrdColWrap>,
|
||||
column_filter::caller<23, T, D, BrdColWrap>,
|
||||
column_filter::caller<24, T, D, BrdColWrap>,
|
||||
column_filter::caller<25, T, D, BrdColWrap>,
|
||||
column_filter::caller<26, T, D, BrdColWrap>,
|
||||
column_filter::caller<27, T, D, BrdColWrap>,
|
||||
column_filter::caller<28, T, D, BrdColWrap>,
|
||||
column_filter::caller<29, T, D, BrdColWrap>,
|
||||
column_filter::caller<30, T, D, BrdColWrap>,
|
||||
column_filter::caller<31, T, D, BrdColWrap>,
|
||||
column_filter::caller<32, T, D, BrdColWrap>
|
||||
}
|
||||
};
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
|
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
|
||||
}
|
||||
}
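filter::linearColumn above turns the runtime (brd_type, ksize) pair into a compile-time kernel size and border policy by indexing a static 5 x 33 table of function pointers, each entry pointing at a different column_filter::caller<KSIZE, T, D, B> instantiation; index 0 is a null sentinel because kernel sizes start at 1. A reduced sketch of the same dispatch pattern, with three sizes and a single policy, follows; runFixed and dispatch are illustrative names only.

// Illustrative sketch of the compile-time-size dispatch pattern, not OpenCV code.
template <int KSIZE>
void runFixed(const float* src, float* dst, int n)
{
    // KSIZE is a compile-time constant here, so the compiler can fully unroll the inner loop.
    for (int i = 0; i + KSIZE <= n; ++i)
    {
        float s = 0.0f;
        for (int k = 0; k < KSIZE; ++k)
            s += src[i + k];
        dst[i] = s;
    }
}

void dispatch(const float* src, float* dst, int n, int ksize)
{
    typedef void (*caller_t)(const float*, float*, int);
    static const caller_t callers[] = { 0, runFixed<1>, runFixed<2>, runFixed<3> };
    callers[ksize](src, dst, n);   // index 0 is a null sentinel; ksize must be 1..3
}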
(File diff suppressed because it is too large.)
@ -46,6 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "fgd_bgfg_common.hpp"

using namespace cv::gpu;
@ -181,57 +183,8 @@ namespace bgfg
|
||||
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
|
||||
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
|
||||
|
||||
data0[threadIdx.x] = sum0;
|
||||
data1[threadIdx.x] = sum1;
|
||||
data2[threadIdx.x] = sum2;
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 128)
|
||||
{
|
||||
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
|
||||
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
|
||||
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 64)
|
||||
{
|
||||
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
|
||||
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
|
||||
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 32)
|
||||
{
|
||||
volatile unsigned int* vdata0 = data0;
|
||||
volatile unsigned int* vdata1 = data1;
|
||||
volatile unsigned int* vdata2 = data2;
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
|
||||
}
|
||||
plus<unsigned int> op;
|
||||
reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
@ -245,9 +198,9 @@ namespace bgfg
void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
                           unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                           unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
                           int cc, cudaStream_t stream)
                           bool cc20, cudaStream_t stream)
{
    const int HISTOGRAM_WARP_COUNT = cc < 20 ? 4 : 6;
    const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
    const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;

    calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
@ -261,10 +214,10 @@ namespace bgfg
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// calcDiffThreshMask
|
||||
|
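Two things change in the fgd kernels above: the hand-written 128/64/warp-synchronous shared-memory reduction is replaced by the device::reduce<MERGE_THREADBLOCK_SIZE>(...) utility from the newly included reduce.hpp header, and the histogram warp count is now chosen from a bool cc20 flag instead of comparing a raw compute-capability value. For reference, the generic block-level tree reduction that the removed code implemented looks roughly like the sketch below; blockReduceSum is an illustrative name, and the OpenCV utility additionally reduces three sums at once through a tuple.

// Illustrative sketch of a shared-memory tree reduction, not the OpenCV utility.
template <int BLOCK_SIZE>
__device__ unsigned int blockReduceSum(unsigned int val, unsigned int* smem)
{
    const int tid = threadIdx.x;
    smem[tid] = val;
    __syncthreads();

    // Halve the number of active threads each step until one partial sum remains.
    for (int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2)
    {
        if (tid < stride)
            smem[tid] += smem[tid + stride];
        __syncthreads();
    }
    return smem[0];   // every thread sees the final sum after the last barrier
}

Unlike the removed code, this version synchronizes on every step instead of relying on implicit warp-synchronous execution for the last 32 threads, which is the safer pattern on later architectures.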
@ -125,7 +125,7 @@ namespace bgfg
void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
                           unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                           unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
                           int cc, cudaStream_t stream);
                           bool cc20, cudaStream_t stream);

template <typename PT, typename CT>
void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
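The header change above mirrors the .cu change: calcDiffHistogram_gpu now takes a bool cc20 flag rather than the raw compute-capability number, so the caller decides once whether the device is Fermi-class (sm_20) or newer. A caller could derive such a flag from the device properties roughly as in this illustrative helper; deviceHasCC20 is not an OpenCV function, and the gpu module exposes the same information through its DeviceInfo class.

#include <cuda_runtime.h>

// Illustrative helper: true when the selected device is compute capability 2.0 or newer.
bool deviceHasCC20(int deviceId)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, deviceId);
    return prop.major >= 2;
}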
@ -47,6 +47,7 @@
#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#include "opencv2/gpu/device/common.hpp"
@ -43,12 +43,10 @@
#if !defined CUDA_DISABLER

#include "thrust/device_ptr.h"
#include "thrust/remove.h"
#include "thrust/functional.h"
#include "internal_shared.hpp"

using namespace thrust;
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include "opencv2/gpu/device/common.hpp"

namespace cv { namespace gpu { namespace device { namespace globmotion {

@ -61,10 +59,10 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
    thrust::device_ptr<float2> dpoints1((float2*)points1);
    thrust::device_ptr<const uchar> dmask(mask);

    return thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
    return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
                 thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
                 dmask, thrust::not1(thrust::identity<uchar>()))
                 - make_zip_iterator(make_tuple(dpoints0, dpoints1));
                 - thrust::make_zip_iterator(make_tuple(dpoints0, dpoints1)));
}
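The compactPoints hunk above fixes two things: the helper calls are fully qualified with thrust:: (the file no longer relies on using namespace thrust), and the iterator difference returned by remove_if is explicitly cast to int, since the function returns int while the iterator difference is a 64-bit value on most platforms. The function itself is a stream compaction: the mask acts as a stencil, and point pairs whose mask byte is zero are removed from both arrays in one pass over a zip iterator. A self-contained sketch of the same idea on raw device pointers follows; compactPairs and the variable names are illustrative.

// Illustrative sketch of mask-driven stream compaction with Thrust; compile with nvcc.
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

// Keeps only the (p0[i], p1[i]) pairs whose mask[i] is non-zero and returns the new count.
int compactPairs(int n, float2* p0, float2* p1, const unsigned char* mask)
{
    thrust::device_ptr<float2> d0(p0), d1(p1);
    thrust::device_ptr<const unsigned char> dmask(mask);

    return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(d0, d1)),
                                   thrust::make_zip_iterator(thrust::make_tuple(d0 + n, d1 + n)),
                                   dmask,
                                   thrust::not1(thrust::identity<unsigned char>()))
                 - thrust::make_zip_iterator(thrust::make_tuple(d0, d1)));
}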
@ -43,182 +43,112 @@
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/utility.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/functional.hpp"
|
||||
#include "opencv2/gpu/device/emulation.hpp"
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace hist
|
||||
{
|
||||
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
|
||||
{
|
||||
__shared__ int shist[256];
|
||||
|
||||
const int y = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
shist[tid] = 0;
|
||||
__syncthreads();
|
||||
|
||||
if (y < rows)
|
||||
{
|
||||
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
|
||||
|
||||
const int cols_4 = cols / 4;
|
||||
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
|
||||
{
|
||||
unsigned int data = rowPtr[x];
|
||||
|
||||
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
|
||||
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
|
||||
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
|
||||
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
|
||||
}
|
||||
|
||||
if (cols % 4 != 0 && threadIdx.x == 0)
|
||||
{
|
||||
for (int x = cols_4 * 4; x < cols; ++x)
|
||||
{
|
||||
unsigned int data = ((const uchar*)rowPtr)[x];
|
||||
Emulation::smem::atomicAdd(&shist[data], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int histVal = shist[tid];
|
||||
if (histVal > 0)
|
||||
::atomicAdd(hist + tid, histVal);
|
||||
}
|
||||
|
||||
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(32, 8);
|
||||
const dim3 grid(divUp(src.rows, block.y));
|
||||
|
||||
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
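The rewritten histogram256 entry point above replaces the older per-warp sub-histogram scheme with a single shared 256-bin histogram per block: a 32 x 8 block zeroes the bins, each block consumes its rows as packed 32-bit words, bytes are counted with shared-memory atomics, and finally each thread flushes one non-empty bin to the global histogram with a single atomicAdd. A stripped-down sketch of that strategy, one byte per load and without the packed-word handling, follows; hist256Simple is an illustrative name.

// Illustrative sketch of a per-block shared-memory histogram; launch with blockDim.x == 256.
__global__ void hist256Simple(const unsigned char* data, int n, unsigned int* hist)
{
    __shared__ unsigned int sh[256];

    // Each thread owns one bin of the per-block sub-histogram.
    sh[threadIdx.x] = 0;
    __syncthreads();

    // Grid-stride loop over the input, accumulating into shared memory.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        atomicAdd(&sh[data[i]], 1u);
    __syncthreads();

    // One global atomic per non-empty bin per block.
    if (sh[threadIdx.x] > 0)
        atomicAdd(&hist[threadIdx.x], sh[threadIdx.x]);
}

Shared-memory atomics require compute capability 1.2 or newer, which is what the USE_SMEM_ATOMICS guard in the removed implementation further below was checking for.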
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace hist
|
||||
{
|
||||
__constant__ int c_lut[256];
|
||||
|
||||
struct EqualizeHist : unary_function<uchar, uchar>
|
||||
{
|
||||
float scale;
|
||||
|
||||
__host__ EqualizeHist(float _scale) : scale(_scale) {}
|
||||
|
||||
__device__ __forceinline__ uchar operator ()(uchar val) const
|
||||
{
|
||||
const int lut = c_lut[val];
|
||||
return __float2int_rn(scale * lut);
|
||||
}
|
||||
};
|
||||
}
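The EqualizeHist functor above maps each pixel to __float2int_rn(scale * c_lut[val]), where the 256-entry lookup table sits in __constant__ memory and, in the host code not shown in this diff, is filled with the cumulative histogram while scale normalizes it to the 0..255 range; the functor is then applied to the whole image through the device::transform framework, which is why a TransformFunctorTraits specialization appears below. A host-side sketch of how such an equalization LUT can be built from a 256-bin histogram follows; buildEqualizeLut is illustrative, and the exact normalization used by OpenCV differs in details.

// Illustrative sketch: build a ready-to-use 0..255 equalization LUT on the host.
void buildEqualizeLut(const int hist[256], int totalPixels, unsigned char lut[256])
{
    const float scale = 255.0f / totalPixels;
    int cdf = 0;                                        // running cumulative histogram
    for (int i = 0; i < 256; ++i)
    {
        cdf += hist[i];
        lut[i] = (unsigned char)(scale * cdf + 0.5f);   // round to nearest, like __float2int_rn
    }
}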
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
#define UINT_BITS 32U
|
||||
|
||||
//Warps == subhistograms per threadblock
|
||||
#define WARP_COUNT 6
|
||||
|
||||
//Threadblock size
|
||||
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
|
||||
#define HISTOGRAM256_BIN_COUNT 256
|
||||
|
||||
//Shared memory per threadblock
|
||||
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
|
||||
|
||||
#define PARTIAL_HISTOGRAM256_COUNT 240
|
||||
|
||||
#define MERGE_THREADBLOCK_SIZE 256
|
||||
|
||||
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
|
||||
|
||||
namespace hist
|
||||
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
|
||||
{
|
||||
#if (!USE_SMEM_ATOMICS)
|
||||
|
||||
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
|
||||
|
||||
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
uint count;
|
||||
do
|
||||
{
|
||||
count = s_WarpHist[data] & TAG_MASK;
|
||||
count = threadTag | (count + 1);
|
||||
s_WarpHist[data] = count;
|
||||
} while (s_WarpHist[data] != count);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define TAG_MASK 0xFFFFFFFFU
|
||||
|
||||
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
atomicAdd(s_WarpHist + data, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
|
||||
{
|
||||
uint x = pos_x << 2;
|
||||
|
||||
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
|
||||
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
|
||||
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
|
||||
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
|
||||
}
|
||||
|
||||
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
|
||||
{
|
||||
//Per-warp subhistogram storage
|
||||
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
|
||||
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
|
||||
|
||||
//Clear shared memory storage for current threadblock before processing
|
||||
#pragma unroll
|
||||
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
|
||||
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
|
||||
|
||||
//Cycle through the entire data set, update subhistograms for each warp
|
||||
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
|
||||
|
||||
__syncthreads();
|
||||
const uint colsui = d_Data.step / sizeof(uint);
|
||||
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
|
||||
{
|
||||
uint pos_y = pos / colsui;
|
||||
uint pos_x = pos % colsui;
|
||||
uint data = d_Data.ptr(pos_y)[pos_x];
|
||||
addWord(s_WarpHist, data, tag, pos_x, cols);
|
||||
}
|
||||
|
||||
//Merge per-warp histograms into per-block and write to global memory
|
||||
__syncthreads();
|
||||
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
|
||||
{
|
||||
uint sum = 0;
|
||||
|
||||
for (uint i = 0; i < WARP_COUNT; i++)
|
||||
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
|
||||
|
||||
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge histogram256() output
|
||||
// Run one threadblock per bin; each threadblock adds up the same bin counter
|
||||
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
|
||||
// takes only a fraction of total processing time
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
|
||||
{
|
||||
uint sum = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
|
||||
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
|
||||
|
||||
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
|
||||
data[threadIdx.x] = sum;
|
||||
|
||||
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
__syncthreads();
|
||||
if(threadIdx.x < stride)
|
||||
data[threadIdx.x] += data[threadIdx.x + stride];
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
|
||||
}
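The comment block above describes the merge scheme; written serially, the same computation is simply a per-bin sum over all partial histograms (a sketch, not part of the patch; names are assumed):

```cpp
// Serial equivalent of mergeHistogram256: bin b of the final histogram is the
// sum of bin b across every partial histogram (count partials of 256 bins each).
void mergeHistogram256_ref(const unsigned int* partial, int count, int* hist)
{
    for (int bin = 0; bin < 256; ++bin)
    {
        unsigned int sum = 0;
        for (int i = 0; i < count; ++i)
            sum += partial[i * 256 + bin];
        hist[bin] = (int)sum;
    }
}
```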

void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
PtrStepSz<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);

cudaSafeCall( cudaGetLastError() );

mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);

cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__constant__ int c_lut[256];

__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x < src.cols && y < src.rows)
{
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}

void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
enum { smart_shift = 4 };
};
}}}

namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );

equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace hist
}}} // namespace cv { namespace gpu { namespace device
const float scale = 255.0f / (src.cols * src.rows);

transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
}

#endif /* CUDA_DISABLER */
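The kernels above only apply `c_lut`; the LUT itself is expected to hold the cumulative histogram of the source image, so that each pixel value v is mapped to round(255 * lut[v] / (cols * rows)). A host-side sketch of building such a LUT (illustrative only; whether the real caller computes it on the host or with a prefix-sum kernel is outside this hunk):

```cpp
// Sketch: equalisation LUT from a 256-bin histogram. lut[v] is the cumulative
// count of pixels with value <= v; equalizeHist then scales it by 255/(cols*rows).
void buildEqualizeLut(const int hist[256], int lut[256])
{
    int cum = 0;
    for (int v = 0; v < 256; ++v)
    {
        cum += hist[v];
        lut[v] = cum;
    }
}
```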
@ -42,7 +42,10 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
@ -226,29 +229,32 @@ namespace cv { namespace gpu { namespace device

template<int size>
__device__ float reduce_smem(volatile float* smem)
__device__ float reduce_smem(float* smem, float val)
{
unsigned int tid = threadIdx.x;
float sum = smem[tid];
float sum = val;

if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
reduce<size>(smem, sum, tid, plus<float>());

if (tid < 32)
if (size == 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
#if __CUDA_ARCH__ >= 300
return shfl(sum, 0);
#else
return smem[0];
#endif
}
else
{
#if __CUDA_ARCH__ >= 300
if (threadIdx.x == 0)
smem[0] = sum;
#endif

__syncthreads();
sum = smem[0];
__syncthreads();

return sum;
return smem[0];
}
}

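The compute-capability 3.0 path above returns `shfl(sum, 0)`: on those devices a 32-thread warp can finish the reduction entirely in registers, with no shared memory or barrier. A minimal sketch of that idea, using the older non-`_sync` shuffle intrinsics this code base targets (illustrative, not the `reduce`/`shfl` wrappers themselves):

```cuda
// Warp-wide sum in registers via shuffle (assumes sm_30+, full 32-lane warp).
__device__ __forceinline__ float warpReduceSum(float val)
{
    #pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1)
        val += __shfl_down(val, delta);   // lane i adds the value held by lane i + delta
    return __shfl(val, 0);                // broadcast lane 0's total to every lane
}
```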
@ -272,19 +278,13 @@ namespace cv { namespace gpu { namespace device
if (threadIdx.x < block_hist_size)
elem = hist[0];

squares[threadIdx.x] = elem * elem;

__syncthreads();
float sum = reduce_smem<nthreads>(squares);
float sum = reduce_smem<nthreads>(squares, elem * elem);

float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = ::min(elem * scale, threshold);

__syncthreads();
squares[threadIdx.x] = elem * elem;
sum = reduce_smem<nthreads>(squares, elem * elem);

__syncthreads();
sum = reduce_smem<nthreads>(squares);
scale = 1.0f / (::sqrtf(sum) + 1e-3f);

if (threadIdx.x < block_hist_size)
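The hunk above is the HOG block-histogram normalisation; written serially it amounts to an L2-Hys step, where `threshold` clips outliers between the two normalisation passes (a sketch of the computation, not the kernel itself):

```cpp
#include <algorithm>
#include <cmath>

// Serial sketch of the per-block L2-Hys normalisation performed above.
void normalizeBlockHist(float* hist, int n, float threshold)
{
    float sum = 0.f;
    for (int i = 0; i < n; ++i) sum += hist[i] * hist[i];
    float scale = 1.f / (std::sqrt(sum) + 0.1f * n);

    sum = 0.f;
    for (int i = 0; i < n; ++i)
    {
        hist[i] = std::min(hist[i] * scale, threshold);  // clip after the first pass
        sum += hist[i] * hist[i];
    }
    scale = 1.f / (std::sqrt(sum) + 1e-3f);
    for (int i = 0; i < n; ++i) hist[i] *= scale;
}
```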
@ -330,65 +330,36 @@ namespace cv { namespace gpu { namespace device

// return confidence values not just positive location
template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram block processed by single GPU thread block
int nblocks> // Number of histogram block processed by single GPU thread block
__global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
float free_coef, float threshold, float* confidences)
{
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;

const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;

float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}

__shared__ float products[nthreads * nblocks];
__shared__ float products[nthreads * nblocks];

const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;
const int tid = threadIdx.z * nthreads + threadIdx.x;

__syncthreads();
reduce<nthreads>(products, product, tid, plus<float>());

if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}

if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}

if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
= (float)(product + free_coef);
if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;

}

@ -396,32 +367,32 @@ namespace cv { namespace gpu { namespace device
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
float* coefs, float free_coef, float threshold, float *confidences)
{
const int nthreads = 256;
const int nblocks = 1;
const int nthreads = 256;
const int nblocks = 1;

int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;

dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);
dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);

cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
cudaFuncCachePreferL1));
cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
cudaFuncCachePreferL1));

int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, confidences);
cudaSafeCall(cudaThreadSynchronize());
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, confidences);
cudaSafeCall(cudaThreadSynchronize());
}

template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram block processed by single GPU thread block
int nblocks> // Number of histogram block processed by single GPU thread block
__global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
@ -446,36 +417,8 @@ namespace cv { namespace gpu { namespace device
__shared__ float products[nthreads * nblocks];

const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;

__syncthreads();

if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}

if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}
reduce<nthreads>(products, product, tid, plus<float>());

if (threadIdx.x == 0)
labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);

@ -42,7 +42,9 @@

#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"

@ -295,7 +295,7 @@ namespace cv { namespace gpu { namespace device
int grid = divUp(workAmount, block);
cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
}
}
}}}

@ -76,7 +76,7 @@ namespace cv { namespace gpu { namespace device
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0 * CV_PI;
angle += (angle < 0) * 2.0f * CV_PI_F;
dst[y * dst_step + x] = scale * angle;
}
};
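Since ::atan2f returns values in [-π, π], the branch-free term `(angle < 0) * 2.0f * CV_PI_F` shifts only negative results into the [0, 2π) range: for example atan2f(-1.f, 0.f) = -π/2 becomes 3π/2, while atan2f(1.f, 0.f) = π/2 is left unchanged. Using CV_PI_F instead of the double CV_PI keeps the whole expression in single precision.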
@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);

const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;

cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
@ -190,7 +190,7 @@ namespace cv { namespace gpu { namespace device
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);

const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;

polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);

File diff suppressed because it is too large
@ -43,11 +43,11 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/block.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

using namespace cv::gpu;
@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device
{
namespace imgproc
{

template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
}

static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
{
return thrust::tie(val1, val2);
}

static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}

static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
{
return thrust::tie(val1, val2.x, val2.y);
}

static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}

static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z);
}

static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}

static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
}

static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op, op);
}
};

__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}

volatile __shared__ float cta_buffer[CTA_SIZE];
__shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];

int tid = threadIdx.x;
reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
threadIdx.x,
Unroll<VecTraits<T>::cn>::op());

cta_buffer[tid] = weights_sum;
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
weights_sum = cta_buffer[0];

__syncthreads();

for(int n = 0; n < VecTraits<T>::cn; ++n)
{
cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];

__syncthreads();
}

if (tid == 0)
dst = saturate_cast<T>(sum/weights_sum);
if (threadIdx.x == 0)
dst = saturate_cast<T>(sum / weights_sum);
}

__device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
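In the hunk above, `sum` and `weights_sum` accumulate the non-local-means estimate dst(p) = Σ_q w(p, q) · src(q) / Σ_q w(p, q); the change replaces the old per-channel Block::reduce loop with a single tuple-based reduce that sums `weights_sum` and every channel of `sum` in one pass over the enlarged shared buffer.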
@ -164,40 +164,40 @@ namespace cv { namespace gpu { namespace device

r = ::fmin(r, 2.5f);

v[1].x = arrow_x + r * ::cosf(theta - CV_PI / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI / 2.0f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);

v[4].x = arrow_x + r * ::cosf(theta + CV_PI / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);

int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;

@ -42,7 +42,6 @@

#if !defined CUDA_DISABLER

#include <stdio.h>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
@ -57,8 +56,6 @@
#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100

using namespace std;

namespace cv { namespace gpu { namespace device { namespace optflow_farneback
{
__constant__ float c_g[8];

@ -47,10 +47,11 @@

#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
@ -75,9 +76,9 @@ namespace cv { namespace gpu { namespace device

__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem[8 * 32];

volatile int* srow = smem + threadIdx.y * blockDim.x;
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
__shared__ int smem2[8 * 32];

const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;

@ -109,9 +110,12 @@ namespace cv { namespace gpu { namespace device
c += Ix * Iy;
}

reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
int* srow2 = smem2 + threadIdx.y * blockDim.x;

plus<int> op;
reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));

if (threadIdx.x == 0)
{
@ -151,9 +155,13 @@ namespace cv { namespace gpu { namespace device

__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem[8 * 32];
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];

volatile int* srow = smem + threadIdx.y * blockDim.x;
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;

plus<int> op;

const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;

@ -167,7 +175,7 @@ namespace cv { namespace gpu { namespace device
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);

reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
reduce<32>(srow0, m_10, threadIdx.x, op);

for (int v = 1; v <= half_k; ++v)
{
@ -185,8 +193,7 @@ namespace cv { namespace gpu { namespace device
m_sum += u * (val_plus + val_minus);
}

reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));

m_10 += m_sum;
m_01 += v * v_sum;

File diff suppressed because it is too large
@ -69,7 +69,7 @@ namespace cv { namespace gpu { namespace device

template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device

template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, int)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)srcWhole;
(void)xoff;
@ -124,10 +124,10 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, int cc) \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
@ -142,7 +142,7 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, int) \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@ -194,20 +194,20 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc);
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
}
};

template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);

static const caller_t callers[3][5] =
{
@ -235,38 +235,38 @@ namespace cv { namespace gpu { namespace device
};

callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}

template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device

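The dispatcher now takes a `bool cc20` flag instead of a raw compute-capability number. How the caller derives that flag is outside this hunk; one plain CUDA-runtime way to obtain it (a sketch under that assumption, not necessarily what the host side of this patch does) would be:

```cpp
#include <cuda_runtime.h>

// Sketch: decide the cc20 flag from the current device's compute capability.
bool deviceHasCC20()
{
    int dev = 0;
    cudaGetDevice(&dev);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    return prop.major >= 2;   // Fermi (sm_20) or newer
}
```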
modules/gpu/src/cuda/row_filter.0.cu
Normal file
@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
template void linearRow<uchar, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53
modules/gpu/src/cuda/row_filter.1.cu
Normal file
53
modules/gpu/src/cuda/row_filter.1.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "row_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearRow<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
modules/gpu/src/cuda/row_filter.10.cu
Normal file
@ -0,0 +1,53 @@

#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
template void linearRow<unsigned short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
modules/gpu/src/cuda/row_filter.11.cu
Normal file
@ -0,0 +1,53 @@

#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
template void linearRow<ushort3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.12.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<ushort4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.13.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<int3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.14.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<int4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.2.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.3.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.4.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<int, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.5.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.6.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.7.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.8.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.9.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<short4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
@@ -1,390 +0,0 @@
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
#include "opencv2/gpu/device/static_check.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace row_filter
|
||||
{
|
||||
#define MAX_KERNEL_SIZE 32
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
|
||||
{
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, typename B>
|
||||
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
|
||||
{
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int BLOCK_DIM_X = 32;
|
||||
const int BLOCK_DIM_Y = 8;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = 1;
|
||||
#else
|
||||
const int BLOCK_DIM_X = 32;
|
||||
const int BLOCK_DIM_Y = 4;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = 1;
|
||||
#endif
|
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
|
||||
|
||||
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
|
||||
|
||||
if (y >= src.rows)
|
||||
return;
|
||||
|
||||
const T* src_row = src.ptr(y);
|
||||
|
||||
const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
|
||||
|
||||
if (blockIdx.x > 0)
|
||||
{
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
|
||||
}
|
||||
|
||||
if (blockIdx.x + 2 < gridDim.x)
|
||||
{
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
{
|
||||
const int x = xStart + j * BLOCK_DIM_X;
|
||||
|
||||
if (x < src.cols)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < KSIZE; ++k)
|
||||
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
|
||||
|
||||
dst(y, x) = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B>
|
||||
void linearRowFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
|
||||
{
|
||||
int BLOCK_DIM_X;
|
||||
int BLOCK_DIM_Y;
|
||||
int PATCH_PER_BLOCK;
|
||||
|
||||
if (cc >= 20)
|
||||
{
|
||||
BLOCK_DIM_X = 32;
|
||||
BLOCK_DIM_Y = 8;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCK_DIM_X = 32;
|
||||
BLOCK_DIM_Y = 4;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
|
||||
|
||||
B<T> brd(src.cols);
|
||||
|
||||
linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T, typename D>
|
||||
void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5][33] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowWrap>
|
||||
}
|
||||
};
|
||||
|
||||
loadKernel(kernel, ksize, stream);
|
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
|
||||
}
|
||||
|
||||
template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<int , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
} // namespace row_filter
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
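The 390-line row_filter.cu removed above carried, besides the kernel itself, a callers[5][33] table of function pointers: the first index is the border mode (BrdRowReflect101, BrdRowReplicate, BrdRowConstant, BrdRowReflect, BrdRowWrap) and the second is the kernel width, with slot 0 left null because a valid ksize starts at 1. The same table reappears in the new row_filter.h below, now built from row_filter::caller. The stand-alone sketch that follows shows the dispatch idea with made-up names and a reduced table size; it is illustrative only, not code from the patch.

    // Reduced sketch of the [border][ksize] function-pointer dispatch (illustrative).
    #include <cstdio>

    struct Reflect101 {};                      // stand-ins for the BrdRow* border policies
    struct Replicate  {};

    template <int KSIZE, class Border>
    void run_filter() { std::printf("ksize=%d\n", KSIZE); }  // stand-in for row_filter::caller

    typedef void (*caller_t)();

    static const caller_t callers[2][4] =      // the real table is [5][33]
    {
        { 0, run_filter<1, Reflect101>, run_filter<2, Reflect101>, run_filter<3, Reflect101> },
        { 0, run_filter<1, Replicate>,  run_filter<2, Replicate>,  run_filter<3, Replicate>  },
    };

    int main()
    {
        int brd_type = 1, ksize = 3;
        callers[brd_type][ksize]();            // mirrors callers[brd_type][ksize](src, dst, anchor, cc, stream)
        return 0;
    }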
372  modules/gpu/src/cuda/row_filter.h  Normal file
@@ -0,0 +1,372 @@
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace row_filter
|
||||
{
|
||||
#define MAX_KERNEL_SIZE 32
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
template <int KSIZE, typename T, typename D, typename B>
|
||||
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
|
||||
{
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int BLOCK_DIM_X = 32;
|
||||
const int BLOCK_DIM_Y = 8;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = 1;
|
||||
#else
|
||||
const int BLOCK_DIM_X = 32;
|
||||
const int BLOCK_DIM_Y = 4;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = 1;
|
||||
#endif
|
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
|
||||
|
||||
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
|
||||
|
||||
if (y >= src.rows)
|
||||
return;
|
||||
|
||||
const T* src_row = src.ptr(y);
|
||||
|
||||
const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
|
||||
|
||||
if (blockIdx.x > 0)
|
||||
{
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
|
||||
}
|
||||
|
||||
if (blockIdx.x + 2 < gridDim.x)
|
||||
{
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
{
|
||||
const int x = xStart + j * BLOCK_DIM_X;
|
||||
|
||||
if (x < src.cols)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < KSIZE; ++k)
|
||||
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
|
||||
|
||||
dst(y, x) = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
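The kernel above stages one source row plus a BLOCK_DIM_X-wide halo on each side in shared memory, then each thread convolves PATCH_PER_BLOCK output pixels against the taps in c_kernel. A minimal host-side reference of the same row convolution with reflect-101 border handling may help when checking the anchor arithmetic; it is an illustrative sketch (float data, kernel no wider than the row), not code from this patch.

// Illustrative host reference for the row pass above (assumption: float data,
// BORDER_REFLECT_101 semantics, ksize <= cols). Not part of the patch.
#include <vector>

static int reflect101(int x, int len)
{
    if (x < 0)    return -x;                // -1 -> 1 (edge pixel not repeated)
    if (x >= len) return 2 * len - 2 - x;   // len -> len - 2
    return x;
}

void rowFilterRef(const std::vector<float>& src, std::vector<float>& dst,
                  int cols, const std::vector<float>& kernel, int anchor)
{
    dst.resize(cols);
    for (int x = 0; x < cols; ++x)
    {
        float sum = 0.f;
        for (int k = 0; k < (int)kernel.size(); ++k)
            sum += src[reflect101(x - anchor + k, cols)] * kernel[k];
        dst[x] = sum;
    }
}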
|
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B>
|
||||
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
|
||||
{
|
||||
int BLOCK_DIM_X;
|
||||
int BLOCK_DIM_Y;
|
||||
int PATCH_PER_BLOCK;
|
||||
|
||||
if (cc >= 20)
|
||||
{
|
||||
BLOCK_DIM_X = 32;
|
||||
BLOCK_DIM_Y = 8;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCK_DIM_X = 32;
|
||||
BLOCK_DIM_Y = 4;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
|
||||
|
||||
B<T> brd(src.cols);
|
||||
|
||||
linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5][33] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 2, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 3, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 4, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 5, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 6, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 7, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 8, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 9, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<10, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<11, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<12, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<13, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<14, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<15, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<16, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<17, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<18, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<19, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<20, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<21, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<22, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<23, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<24, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<25, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<26, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<27, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<28, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<29, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<30, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<31, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<32, T, D, BrdRowReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 2, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 3, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 4, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 5, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 6, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 7, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 8, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 9, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<10, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<11, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<12, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<13, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<14, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<15, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<16, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<17, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<18, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<19, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<20, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<21, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<22, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<23, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<24, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<25, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<26, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<27, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<28, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<29, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<30, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<31, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<32, T, D, BrdRowReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 2, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 3, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 4, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 5, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 6, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 7, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 8, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 9, T, D, BrdRowConstant>,
|
||||
row_filter::caller<10, T, D, BrdRowConstant>,
|
||||
row_filter::caller<11, T, D, BrdRowConstant>,
|
||||
row_filter::caller<12, T, D, BrdRowConstant>,
|
||||
row_filter::caller<13, T, D, BrdRowConstant>,
|
||||
row_filter::caller<14, T, D, BrdRowConstant>,
|
||||
row_filter::caller<15, T, D, BrdRowConstant>,
|
||||
row_filter::caller<16, T, D, BrdRowConstant>,
|
||||
row_filter::caller<17, T, D, BrdRowConstant>,
|
||||
row_filter::caller<18, T, D, BrdRowConstant>,
|
||||
row_filter::caller<19, T, D, BrdRowConstant>,
|
||||
row_filter::caller<20, T, D, BrdRowConstant>,
|
||||
row_filter::caller<21, T, D, BrdRowConstant>,
|
||||
row_filter::caller<22, T, D, BrdRowConstant>,
|
||||
row_filter::caller<23, T, D, BrdRowConstant>,
|
||||
row_filter::caller<24, T, D, BrdRowConstant>,
|
||||
row_filter::caller<25, T, D, BrdRowConstant>,
|
||||
row_filter::caller<26, T, D, BrdRowConstant>,
|
||||
row_filter::caller<27, T, D, BrdRowConstant>,
|
||||
row_filter::caller<28, T, D, BrdRowConstant>,
|
||||
row_filter::caller<29, T, D, BrdRowConstant>,
|
||||
row_filter::caller<30, T, D, BrdRowConstant>,
|
||||
row_filter::caller<31, T, D, BrdRowConstant>,
|
||||
row_filter::caller<32, T, D, BrdRowConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 2, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 3, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 4, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 5, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 6, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 7, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 8, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 9, T, D, BrdRowReflect>,
|
||||
row_filter::caller<10, T, D, BrdRowReflect>,
|
||||
row_filter::caller<11, T, D, BrdRowReflect>,
|
||||
row_filter::caller<12, T, D, BrdRowReflect>,
|
||||
row_filter::caller<13, T, D, BrdRowReflect>,
|
||||
row_filter::caller<14, T, D, BrdRowReflect>,
|
||||
row_filter::caller<15, T, D, BrdRowReflect>,
|
||||
row_filter::caller<16, T, D, BrdRowReflect>,
|
||||
row_filter::caller<17, T, D, BrdRowReflect>,
|
||||
row_filter::caller<18, T, D, BrdRowReflect>,
|
||||
row_filter::caller<19, T, D, BrdRowReflect>,
|
||||
row_filter::caller<20, T, D, BrdRowReflect>,
|
||||
row_filter::caller<21, T, D, BrdRowReflect>,
|
||||
row_filter::caller<22, T, D, BrdRowReflect>,
|
||||
row_filter::caller<23, T, D, BrdRowReflect>,
|
||||
row_filter::caller<24, T, D, BrdRowReflect>,
|
||||
row_filter::caller<25, T, D, BrdRowReflect>,
|
||||
row_filter::caller<26, T, D, BrdRowReflect>,
|
||||
row_filter::caller<27, T, D, BrdRowReflect>,
|
||||
row_filter::caller<28, T, D, BrdRowReflect>,
|
||||
row_filter::caller<29, T, D, BrdRowReflect>,
|
||||
row_filter::caller<30, T, D, BrdRowReflect>,
|
||||
row_filter::caller<31, T, D, BrdRowReflect>,
|
||||
row_filter::caller<32, T, D, BrdRowReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 2, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 3, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 4, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 5, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 6, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 7, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 8, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 9, T, D, BrdRowWrap>,
|
||||
row_filter::caller<10, T, D, BrdRowWrap>,
|
||||
row_filter::caller<11, T, D, BrdRowWrap>,
|
||||
row_filter::caller<12, T, D, BrdRowWrap>,
|
||||
row_filter::caller<13, T, D, BrdRowWrap>,
|
||||
row_filter::caller<14, T, D, BrdRowWrap>,
|
||||
row_filter::caller<15, T, D, BrdRowWrap>,
|
||||
row_filter::caller<16, T, D, BrdRowWrap>,
|
||||
row_filter::caller<17, T, D, BrdRowWrap>,
|
||||
row_filter::caller<18, T, D, BrdRowWrap>,
|
||||
row_filter::caller<19, T, D, BrdRowWrap>,
|
||||
row_filter::caller<20, T, D, BrdRowWrap>,
|
||||
row_filter::caller<21, T, D, BrdRowWrap>,
|
||||
row_filter::caller<22, T, D, BrdRowWrap>,
|
||||
row_filter::caller<23, T, D, BrdRowWrap>,
|
||||
row_filter::caller<24, T, D, BrdRowWrap>,
|
||||
row_filter::caller<25, T, D, BrdRowWrap>,
|
||||
row_filter::caller<26, T, D, BrdRowWrap>,
|
||||
row_filter::caller<27, T, D, BrdRowWrap>,
|
||||
row_filter::caller<28, T, D, BrdRowWrap>,
|
||||
row_filter::caller<29, T, D, BrdRowWrap>,
|
||||
row_filter::caller<30, T, D, BrdRowWrap>,
|
||||
row_filter::caller<31, T, D, BrdRowWrap>,
|
||||
row_filter::caller<32, T, D, BrdRowWrap>
|
||||
}
|
||||
};
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
|
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
|
||||
}
|
||||
}
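linearRow copies the filter taps into the __constant__ array c_kernel (synchronously on the default stream, asynchronously otherwise) and then indexes the static callers table by border type and kernel size, so each supported KSIZE runs as a separately instantiated kernel whose inner loop is fully unrolled. A compact sketch of that runtime-to-compile-time dispatch pattern follows; the names (c_taps, convolve1d, launch, convolve) are hypothetical and the table is truncated for brevity.

// Sketch of dispatching a runtime kernel size to compile-time instantiations.
// Hypothetical names; assumes 'taps' is a device pointer, as in the patch.
#include <cuda_runtime.h>

__constant__ float c_taps[32];

template <int KSIZE>
__global__ void convolve1d(const float* src, float* dst, int n, int anchor)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x >= n)
        return;

    float sum = 0.f;
    #pragma unroll
    for (int k = 0; k < KSIZE; ++k)
    {
        const int idx = ::min(::max(x - anchor + k, 0), n - 1); // clamped border
        sum += src[idx] * c_taps[k];
    }
    dst[x] = sum;
}

typedef void (*launch_t)(const float*, float*, int, int, cudaStream_t);

template <int KSIZE>
void launch(const float* src, float* dst, int n, int anchor, cudaStream_t s)
{
    convolve1d<KSIZE><<<(n + 255) / 256, 256, 0, s>>>(src, dst, n, anchor);
}

void convolve(const float* src, float* dst, int n,
              const float* taps, int ksize, int anchor, cudaStream_t s)
{
    static const launch_t table[] = { 0, launch<1>, launch<2>, launch<3>, launch<4>, launch<5> };
    cudaMemcpyToSymbolAsync(c_taps, taps, ksize * sizeof(float), 0,
                            cudaMemcpyDeviceToDevice, s);
    table[ksize](src, dst, n, anchor, s);                // ksize in [1, 5] here
}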
|
@@ -454,7 +454,7 @@ namespace cv { namespace gpu { namespace device
grid.x = divUp(cols, threads.x << 1);
grid.y = divUp(rows, threads.y);

int elem_step = u.step/sizeof(T);
int elem_step = (int)(u.step / sizeof(T));

for(int t = 0; t < iters; ++t)
{
@@ -42,9 +42,11 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
@@ -297,28 +299,13 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
|
||||
extern __shared__ float smem[];
|
||||
float* dline = smem + winsz * threadIdx.z;
|
||||
|
||||
dline[tid] = val;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
|
||||
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }
|
||||
|
||||
volatile float* vdline = smem + winsz * threadIdx.z;
|
||||
|
||||
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
|
||||
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
|
||||
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
|
||||
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
|
||||
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
|
||||
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
|
||||
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
|
||||
|
||||
T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;
|
||||
|
||||
if (tid == 0)
|
||||
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
|
||||
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -496,26 +483,11 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
|
||||
extern __shared__ float smem[];
|
||||
float* dline = smem + winsz * threadIdx.z;
|
||||
|
||||
dline[tid] = val;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
|
||||
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }
|
||||
|
||||
volatile float* vdline = smem + winsz * threadIdx.z;
|
||||
|
||||
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
|
||||
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
|
||||
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
|
||||
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
|
||||
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
|
||||
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
|
||||
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
|
||||
|
||||
if (tid == 0)
|
||||
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
|
||||
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
|
||||
}
|
||||
}
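Both data-cost kernels above swap a hand-written shared-memory tree reduction (explicit __syncthreads steps for the upper levels plus a volatile warp-synchronous tail) for a single call to the generic reduce<winsz>(...) utility from opencv2/gpu/device/reduce.hpp. A self-contained sketch of the block-wide sum that such a call performs is given here for reference; it is illustrative, not the library implementation.

// Illustrative block-wide sum (what reduce<winsz>(smem, val, tid, plus<float>())
// accomplishes before thread 0 writes the result). Not the OpenCV primitive.
template <int BLOCK>
__global__ void blockSum(const float* in, float* out)
{
    __shared__ float smem[BLOCK];
    const int tid = threadIdx.x;

    float val = in[blockIdx.x * BLOCK + tid];
    smem[tid] = val;
    __syncthreads();

    for (int s = BLOCK / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            smem[tid] += smem[tid + s];
        __syncthreads();
    }

    if (tid == 0)
        out[blockIdx.x] = smem[0]; // same role as data_cost[...] = saturate_cast<T>(val)
}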
|
||||
|
||||
|
@@ -47,13 +47,13 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/filters.hpp"
#include <float.h>

namespace cv { namespace gpu { namespace device
{
@@ -568,7 +568,9 @@ namespace cv { namespace gpu { namespace device

float bestx = 0, besty = 0, best_mod = 0;

#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
@@ -599,8 +601,9 @@ namespace cv { namespace gpu { namespace device
|
||||
sumy += s_Y[threadIdx.x + 96];
|
||||
}
|
||||
|
||||
device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
|
||||
device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
|
||||
plus<float> op;
|
||||
device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
|
||||
thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
|
||||
|
||||
const float temp_mod = sumx * sumx + sumy * sumy;
|
||||
if (temp_mod > best_mod)
|
||||
@@ -638,7 +641,7 @@ namespace cv { namespace gpu { namespace device
|
||||
kp_dir *= 180.0f / CV_PI_F;
|
||||
|
||||
kp_dir = 360.0f - kp_dir;
|
||||
if (abs(kp_dir - 360.f) < FLT_EPSILON)
|
||||
if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
|
||||
kp_dir = 0.f;
|
||||
|
||||
featureDir[blockIdx.x] = kp_dir;
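In the orientation hunk further up, the two separate 32-lane reductions of sumx and sumy are fused into one tuple-based device::reduce call over both accumulators. A generic sketch of reducing two values in a single shared-memory pass is shown below; it assumes one 32-thread warp executing in lock-step (true for the pre-Volta devices this code targets) and is not the smem_tuple machinery the patch actually uses.

// Illustrative two-accumulator warp reduction (assumption: 32 lanes, lock-step
// execution). The patch uses smem_tuple/thrust::tie from reduce.hpp instead.
__device__ void warpSum2(volatile float* sx, volatile float* sy,
                         float& sumx, float& sumy, int lane)
{
    sx[lane] = sumx;
    sy[lane] = sumy;

    for (int offset = 16; offset > 0; offset >>= 1)
    {
        if (lane < offset)
        {
            sx[lane] += sx[lane + offset];
            sy[lane] += sy[lane + offset];
        }
    }

    sumx = sx[0];
    sumy = sy[0];
}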
|
||||
@@ -697,11 +700,6 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef uchar elem_type;
|
||||
|
||||
__device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
|
||||
centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
|
||||
{
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uchar operator ()(int i, int j) const
|
||||
{
|
||||
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
|
||||
@@ -715,285 +713,215 @@ namespace cv { namespace gpu { namespace device
|
||||
float win_offset;
|
||||
float cos_dir;
|
||||
float sin_dir;
|
||||
int width;
|
||||
int height;
|
||||
};
|
||||
|
||||
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
|
||||
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
|
||||
float& dx, float& dy)
|
||||
{
|
||||
__shared__ float s_PATCH[6][6];
|
||||
__shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
|
||||
|
||||
const float centerX = featureX[blockIdx.x];
|
||||
const float centerY = featureY[blockIdx.x];
|
||||
const float size = featureSize[blockIdx.x];
|
||||
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
|
||||
if (std::abs(descriptor_dir - 360.f) < FLT_EPSILON)
|
||||
descriptor_dir = 0.f;
|
||||
descriptor_dir *= (float)(CV_PI_F / 180.0f);
|
||||
dx = dy = 0.0f;
|
||||
|
||||
/* The sampling intervals and wavelet sized for selecting an orientation
|
||||
and building the keypoint descriptor are defined relative to 's' */
|
||||
const float s = size * 1.2f / 9.0f;
|
||||
WinReader win;
|
||||
|
||||
/* Extract a window of pixels around the keypoint of size 20s */
|
||||
win.centerX = featureX[blockIdx.x];
|
||||
win.centerY = featureY[blockIdx.x];
|
||||
|
||||
// The sampling intervals and wavelet sized for selecting an orientation
|
||||
// and building the keypoint descriptor are defined relative to 's'
|
||||
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
|
||||
|
||||
// Extract a window of pixels around the keypoint of size 20s
|
||||
const int win_size = (int)((PATCH_SZ + 1) * s);
|
||||
|
||||
float sin_dir;
|
||||
float cos_dir;
|
||||
sincosf(descriptor_dir, &sin_dir, &cos_dir);
|
||||
win.width = win.height = win_size;
|
||||
|
||||
/* Nearest neighbour version (faster) */
|
||||
const float win_offset = -(float)(win_size - 1) / 2;
|
||||
|
||||
// Compute sampling points
|
||||
// since grids are 2D, need to compute xBlock and yBlock indices
|
||||
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
|
||||
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
|
||||
const int xIndex = xBlock * 5 + threadIdx.x;
|
||||
const int yIndex = yBlock * 5 + threadIdx.y;
|
||||
|
||||
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
|
||||
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
|
||||
|
||||
LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));
|
||||
|
||||
s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 5 && threadIdx.y < 5)
|
||||
{
|
||||
const int tid = threadIdx.y * 5 + threadIdx.x;
|
||||
|
||||
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
|
||||
|
||||
const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;
|
||||
const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;
|
||||
|
||||
s_dx_bin[tid] = vx;
|
||||
s_dy_bin[tid] = vy;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)
|
||||
{
|
||||
// first step is to reduce from 25 to 16
|
||||
if (tid < 9) // use 9 threads
|
||||
{
|
||||
sdata1[tid] += sdata1[tid + 16];
|
||||
sdata2[tid] += sdata2[tid + 16];
|
||||
sdata3[tid] += sdata3[tid + 16];
|
||||
sdata4[tid] += sdata4[tid + 16];
|
||||
}
|
||||
|
||||
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
|
||||
if (tid < 8)
|
||||
{
|
||||
sdata1[tid] += sdata1[tid + 8];
|
||||
sdata1[tid] += sdata1[tid + 4];
|
||||
sdata1[tid] += sdata1[tid + 2];
|
||||
sdata1[tid] += sdata1[tid + 1];
|
||||
|
||||
sdata2[tid] += sdata2[tid + 8];
|
||||
sdata2[tid] += sdata2[tid + 4];
|
||||
sdata2[tid] += sdata2[tid + 2];
|
||||
sdata2[tid] += sdata2[tid + 1];
|
||||
|
||||
sdata3[tid] += sdata3[tid + 8];
|
||||
sdata3[tid] += sdata3[tid + 4];
|
||||
sdata3[tid] += sdata3[tid + 2];
|
||||
sdata3[tid] += sdata3[tid + 1];
|
||||
|
||||
sdata4[tid] += sdata4[tid + 8];
|
||||
sdata4[tid] += sdata4[tid + 4];
|
||||
sdata4[tid] += sdata4[tid + 2];
|
||||
sdata4[tid] += sdata4[tid + 1];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
{
|
||||
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
|
||||
__shared__ float sdx[25];
|
||||
__shared__ float sdy[25];
|
||||
__shared__ float sdxabs[25];
|
||||
__shared__ float sdyabs[25];
|
||||
|
||||
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
|
||||
__syncthreads();
|
||||
// Nearest neighbour version (faster)
|
||||
win.win_offset = -(win_size - 1.0f) / 2.0f;
|
||||
|
||||
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
|
||||
if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
|
||||
descriptor_dir = 0.f;
|
||||
descriptor_dir *= CV_PI_F / 180.0f;
|
||||
sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);
|
||||
|
||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
if (tid < 25)
|
||||
const int xLoadInd = tid % (PATCH_SZ + 1);
|
||||
const int yLoadInd = tid / (PATCH_SZ + 1);
|
||||
|
||||
if (yLoadInd < (PATCH_SZ + 1))
|
||||
{
|
||||
sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array
|
||||
sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array
|
||||
__syncthreads();
|
||||
|
||||
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
|
||||
__syncthreads();
|
||||
|
||||
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
|
||||
|
||||
// write dx, dy, |dx|, |dy|
|
||||
if (tid == 0)
|
||||
if (s > 1)
|
||||
{
|
||||
descriptors_block[0] = sdx[0];
|
||||
descriptors_block[1] = sdy[0];
|
||||
descriptors_block[2] = sdxabs[0];
|
||||
descriptors_block[3] = sdyabs[0];
|
||||
AreaFilter<WinReader> filter(win, s, s);
|
||||
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
|
||||
}
|
||||
else
|
||||
{
|
||||
LinearFilter<WinReader> filter(win);
|
||||
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int xPatchInd = threadIdx.x % 5;
|
||||
const int yPatchInd = threadIdx.x / 5;
|
||||
|
||||
if (yPatchInd < 5)
|
||||
{
|
||||
const int xBlockInd = threadIdx.y % 4;
|
||||
const int yBlockInd = threadIdx.y / 4;
|
||||
|
||||
const int xInd = xBlockInd * 5 + xPatchInd;
|
||||
const int yInd = yBlockInd * 5 + yPatchInd;
|
||||
|
||||
const float dw = c_DW[yInd * PATCH_SZ + xInd];
|
||||
|
||||
dx = (s_PATCH[yInd ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd ]) * dw;
|
||||
dy = (s_PATCH[yInd + 1][xInd ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd ][xInd + 1]) * dw;
|
||||
}
|
||||
}
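The rewritten calc_dx_dy loads the whole (PATCH_SZ + 1) x (PATCH_SZ + 1) sample window into shared memory (an area filter when the sampling step s exceeds one pixel, bilinear interpolation otherwise) and hands back per-sample responses instead of filling per-block shared arrays. The dx/dy expressions above are 2x2 Haar responses over four neighbouring patch samples weighted by the Gaussian window c_DW; spelled out as a small helper for clarity (illustrative, not part of the patch).

// 2x2 Haar responses used above: p00 = P[y][x], p01 = P[y][x+1],
// p10 = P[y+1][x], p11 = P[y+1][x+1], dw = Gaussian weight. Illustrative only.
__device__ __forceinline__ void haar2x2(float p00, float p01, float p10, float p11,
                                        float dw, float& dx, float& dy)
{
    dx = (p01 - p00 + p11 - p10) * dw; // horizontal step response
    dy = (p10 - p00 + p11 - p01) * dw; // vertical step response
}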
|
||||
|
||||
__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
__global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
{
|
||||
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
|
||||
__shared__ float sdx[25];
|
||||
__shared__ float sdy[25];
|
||||
__shared__ float smem[32 * 16];
|
||||
|
||||
// sum (reduce) 5x5 area response
|
||||
__shared__ float sd1[25];
|
||||
__shared__ float sd2[25];
|
||||
__shared__ float sdabs1[25];
|
||||
__shared__ float sdabs2[25];
|
||||
float* sRow = smem + threadIdx.y * 32;
|
||||
|
||||
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
|
||||
__syncthreads();
|
||||
float dx, dy;
|
||||
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
|
||||
|
||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
float dxabs = ::fabsf(dx);
|
||||
float dyabs = ::fabsf(dy);
|
||||
|
||||
if (tid < 25)
|
||||
plus<float> op;
|
||||
|
||||
reduce<32>(sRow, dx, threadIdx.x, op);
|
||||
reduce<32>(sRow, dy, threadIdx.x, op);
|
||||
reduce<32>(sRow, dxabs, threadIdx.x, op);
|
||||
reduce<32>(sRow, dyabs, threadIdx.x, op);
|
||||
|
||||
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;
|
||||
|
||||
// write dx, dy, |dx|, |dy|
|
||||
if (threadIdx.x == 0)
|
||||
*descriptors_block = make_float4(dx, dy, dxabs, dyabs);
|
||||
}
|
||||
|
||||
__global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
{
|
||||
__shared__ float smem[32 * 16];
|
||||
|
||||
float* sRow = smem + threadIdx.y * 32;
|
||||
|
||||
float dx, dy;
|
||||
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
|
||||
|
||||
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
|
||||
|
||||
plus<float> op;
|
||||
|
||||
float d1 = 0.0f;
|
||||
float d2 = 0.0f;
|
||||
float abs1 = 0.0f;
|
||||
float abs2 = 0.0f;
|
||||
|
||||
if (dy >= 0)
|
||||
{
|
||||
if (sdy[tid] >= 0)
|
||||
{
|
||||
sd1[tid] = sdx[tid];
|
||||
sdabs1[tid] = ::fabs(sdx[tid]);
|
||||
sd2[tid] = 0;
|
||||
sdabs2[tid] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
sd1[tid] = 0;
|
||||
sdabs1[tid] = 0;
|
||||
sd2[tid] = sdx[tid];
|
||||
sdabs2[tid] = ::fabs(sdx[tid]);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
|
||||
__syncthreads();
|
||||
|
||||
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
|
||||
|
||||
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
|
||||
if (tid == 0)
|
||||
{
|
||||
descriptors_block[0] = sd1[0];
|
||||
descriptors_block[1] = sdabs1[0];
|
||||
descriptors_block[2] = sd2[0];
|
||||
descriptors_block[3] = sdabs2[0];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (sdx[tid] >= 0)
|
||||
{
|
||||
sd1[tid] = sdy[tid];
|
||||
sdabs1[tid] = ::fabs(sdy[tid]);
|
||||
sd2[tid] = 0;
|
||||
sdabs2[tid] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
sd1[tid] = 0;
|
||||
sdabs1[tid] = 0;
|
||||
sd2[tid] = sdy[tid];
|
||||
sdabs2[tid] = ::fabs(sdy[tid]);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
|
||||
__syncthreads();
|
||||
|
||||
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
|
||||
if (tid == 0)
|
||||
{
|
||||
descriptors_block[4] = sd1[0];
|
||||
descriptors_block[5] = sdabs1[0];
|
||||
descriptors_block[6] = sd2[0];
|
||||
descriptors_block[7] = sdabs2[0];
|
||||
}
|
||||
d1 = dx;
|
||||
abs1 = ::fabsf(dx);
|
||||
}
|
||||
else
|
||||
{
|
||||
d2 = dx;
|
||||
abs2 = ::fabsf(dx);
|
||||
}
|
||||
|
||||
reduce<32>(sRow, d1, threadIdx.x, op);
|
||||
reduce<32>(sRow, d2, threadIdx.x, op);
|
||||
reduce<32>(sRow, abs1, threadIdx.x, op);
|
||||
reduce<32>(sRow, abs2, threadIdx.x, op);
|
||||
|
||||
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
|
||||
if (threadIdx.x == 0)
|
||||
descriptors_block[0] = make_float4(d1, abs1, d2, abs2);
|
||||
|
||||
if (dx >= 0)
|
||||
{
|
||||
d1 = dy;
|
||||
abs1 = ::fabsf(dy);
|
||||
d2 = 0.0f;
|
||||
abs2 = 0.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
d1 = 0.0f;
|
||||
abs1 = 0.0f;
|
||||
d2 = dy;
|
||||
abs2 = ::fabsf(dy);
|
||||
}
|
||||
|
||||
reduce<32>(sRow, d1, threadIdx.x, op);
|
||||
reduce<32>(sRow, d2, threadIdx.x, op);
|
||||
reduce<32>(sRow, abs1, threadIdx.x, op);
|
||||
reduce<32>(sRow, abs2, threadIdx.x, op);
|
||||
|
||||
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
|
||||
if (threadIdx.x == 0)
|
||||
descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
|
||||
}
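compute_descriptors_64 stores one float4 per 5x5 sub-region, (sum dx, sum dy, sum |dx|, sum |dy|), for 16 x 4 = 64 values; compute_descriptors_128 splits each sum by the sign of the other derivative, for 16 x 8 = 128 values. A host-side sketch of the two accumulation rules (illustrative only, not patch code):

// Per-sample accumulation rules behind the 64- and 128-dimensional layouts.
#include <cmath>

void accumulate64(float dx, float dy, float* bin4)   // one 4-float bin per sub-region
{
    bin4[0] += dx;
    bin4[1] += dy;
    bin4[2] += std::fabs(dx);
    bin4[3] += std::fabs(dy);
}

void accumulate128(float dx, float dy, float* bin8)  // one 8-float bin per sub-region
{
    if (dy >= 0) { bin8[0] += dx; bin8[1] += std::fabs(dx); }
    else         { bin8[2] += dx; bin8[3] += std::fabs(dx); }

    if (dx >= 0) { bin8[4] += dy; bin8[5] += std::fabs(dy); }
    else         { bin8[6] += dy; bin8[7] += std::fabs(dy); }
}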
|
||||
|
||||
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
|
||||
{
|
||||
__shared__ float smem[BLOCK_DIM_X];
|
||||
__shared__ float s_len;
|
||||
|
||||
// no need for thread ID
|
||||
float* descriptor_base = descriptors.ptr(blockIdx.x);
|
||||
|
||||
// read in the unnormalized descriptor values (squared)
|
||||
__shared__ float sqDesc[BLOCK_DIM_X];
|
||||
const float lookup = descriptor_base[threadIdx.x];
|
||||
sqDesc[threadIdx.x] = lookup * lookup;
|
||||
__syncthreads();
|
||||
const float val = descriptor_base[threadIdx.x];
|
||||
|
||||
if (BLOCK_DIM_X >= 128)
|
||||
{
|
||||
if (threadIdx.x < 64)
|
||||
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
|
||||
__syncthreads();
|
||||
}
|
||||
float len = val * val;
|
||||
reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());
|
||||
|
||||
// reduction to get total
|
||||
if (threadIdx.x < 32)
|
||||
{
|
||||
volatile float* smem = sqDesc;
|
||||
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 32];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 16];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 8];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 4];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 2];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 1];
|
||||
}
|
||||
|
||||
// compute length (square root)
|
||||
__shared__ float len;
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
len = sqrtf(sqDesc[0]);
|
||||
}
|
||||
s_len = ::sqrtf(len);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// normalize and store in output
|
||||
descriptor_base[threadIdx.x] = lookup / len;
|
||||
descriptor_base[threadIdx.x] = val / s_len;
|
||||
}
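normalize_descriptors squares each component, block-reduces the sum of squares, takes the square root once, and divides every component by that length. The same step on the host, for reference (illustrative; neither the kernel nor this sketch guards against a zero-length descriptor):

// Host reference for the L2 normalization performed by normalize_descriptors.
#include <cmath>

void normalizeDescriptor(float* d, int len)
{
    float sum = 0.f;
    for (int i = 0; i < len; ++i)
        sum += d[i] * d[i];

    const float inv = 1.f / std::sqrt(sum);
    for (int i = 0; i < len; ++i)
        d[i] *= inv;
}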
|
||||
|
||||
void compute_descriptors_gpu(const PtrStepSzf& descriptors,
|
||||
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
|
||||
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
|
||||
{
|
||||
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
|
||||
|
||||
if (descriptors.cols == 64)
|
||||
{
|
||||
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
|
||||
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
|
||||
normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
else
|
||||
{
|
||||
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
|
||||
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
|
||||
normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
@@ -85,7 +85,7 @@ namespace cv
|
||||
|
||||
namespace device
|
||||
{
|
||||
using pcl::gpu::TextureBinder;
|
||||
using cv::gpu::TextureBinder;
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
|
||||
{
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
|
||||
|
||||
@@ -158,7 +158,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
|
||||
{
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, int)
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
|
||||
{
|
||||
(void)xoff;
|
||||
(void)yoff;
|
||||
@@ -195,10 +195,10 @@ namespace cv { namespace gpu { namespace device
|
||||
}; \
|
||||
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
|
||||
{ \
|
||||
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, int cc) \
|
||||
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
|
||||
{ \
|
||||
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
|
||||
dim3 block(32, cc >= 20 ? 8 : 4); \
|
||||
dim3 block(32, cc20 ? 8 : 4); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_warp_ ## type , srcWhole); \
|
||||
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
|
||||
@@ -212,7 +212,7 @@ namespace cv { namespace gpu { namespace device
|
||||
}; \
|
||||
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
|
||||
{ \
|
||||
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, int) \
|
||||
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
|
||||
{ \
|
||||
dim3 block(32, 8); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
@@ -263,20 +263,20 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
|
||||
{
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
|
||||
{
|
||||
if (stream == 0)
|
||||
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
|
||||
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
|
||||
else
|
||||
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc);
|
||||
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Transform, typename T>
|
||||
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
|
||||
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
static const func_t funcs[3][5] =
|
||||
{
|
||||
@@ -304,84 +304,84 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
|
||||
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
|
||||
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
|
||||
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
|
||||
}
|
||||
|
||||
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
|
||||
|
||||
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
|
||||
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
|
||||
}
|
||||
|
||||
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
|
||||
|
||||
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
|
||||
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
|
||||
}
|
||||
|
||||
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

} // namespace imgproc

}}} // namespace cv { namespace gpu { namespace device
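One set of instantiations above carries the raw compute capability as a trailing int cc argument, the other a bool cc20 flag (device has compute capability 2.0 or higher). A minimal host-side sketch of deriving such a flag with the 2.4-era cv::gpu API; the DeviceInfo-based helper and the commented forwarding call are illustrative, not part of this commit:

    #include <opencv2/gpu/gpu.hpp>

    // Sketch: derive the cc20 flag once on the host and forward it to the
    // launcher; the remaining arguments (offsets, coeffs, border handling)
    // depend on the surrounding wrapper and are only shown in the comment.
    static bool hasCompute20()
    {
        cv::gpu::DeviceInfo info;                               // current CUDA device
        return info.supports(cv::gpu::FEATURE_SET_COMPUTE_20);  // true on Fermi and newer
    }

    // ... inside the host wrapper:
    // warpPerspective_gpu<float>(src, srcWhole, xoff, yoff, coeffs, dst,
    //                            interpolation, borderMode, borderValue,
    //                            stream, hasCompute20());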
@ -1,7 +1,7 @@
#include "cuvid_video_source.h"
#include "cu_safe_call.h"

#if defined(HAVE_CUDA) && !defined(__APPLE__)
#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)

cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const std::string& fname)
{
@ -45,7 +45,7 @@

#include "precomp.hpp"

#if defined(HAVE_CUDA) && !defined(__APPLE__)
#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)

namespace cv { namespace gpu
{
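In both hunks the guard is keyed on HAVE_NVCUVID, which the build defines when NVCUVID support is enabled, rather than on the platform alone. A sketch of the resulting pattern; the stub branch is illustrative and not taken from this commit:

    #if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)
        // real implementation backed by the NVIDIA Video Decoding (NVCUVID) library
    #else
        // illustrative stub: without NVCUVID the entry points would typically
        // just report the missing feature, e.g.
        // CV_Error(CV_GpuNotSupported, "The library was built without NVCUVID support");
    #endif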
@ -45,15 +45,19 @@

namespace cv { namespace gpu { namespace device
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)

#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)

#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
@ -152,46 +156,119 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)

#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL
}}}

#endif
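Each declaration above is produced by the macros at the top of this header. With the by-value OPENCV_GPU_DECLARE_CVTCOLOR_ONE signature, a single line such as OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) expands to three launcher declarations (the _8U32F variants drop the 16-bit overload, and _8U32F_FULL additionally declares the _full_* overloads whose definition continues past the shown context):

    void xyz_to_bgra_8u (PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    void xyz_to_bgra_16u(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    void xyz_to_bgra_32f(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);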
@ -176,28 +176,11 @@ void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat

void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
{
#if (CUDA_VERSION < 5000)
(void)src;
(void)dst;
(void)h_luminance;
(void)h_color;
(void)search_window;
(void)block_window;
(void)s;

CV_Error( CV_GpuApiCallError, "Lab method required CUDA 5.0 and higher" );
#else

CV_Assert(src.type() == CV_8UC3);

lab.create(src.size(), src.type());
cv::gpu::cvtColor(src, lab, CV_BGR2Lab, 0, s);

/*Mat t;
cv::cvtColor(Mat(src), t, CV_BGR2Lab);
lab.upload(t);*/

l.create(src.size(), CV_8U);
ab.create(src.size(), CV_8UC2);
device::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
@ -207,11 +190,6 @@ void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat&

device::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s));
cv::gpu::cvtColor(lab, dst, CV_Lab2BGR, 0, s);

/*cv::cvtColor(Mat(lab), t, CV_Lab2BGR);
dst.upload(t);*/

#endif
}
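labMethod converts the CV_8UC3 input to Lab, splits the L and ab planes, denoises them with the luminance and color filter strengths, merges the result, and converts back to BGR. A minimal usage sketch, assuming a CUDA-capable build of the gpu module; the file names and filter strengths are illustrative:

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat host = cv::imread("noisy.png");    // must load as CV_8UC3 for labMethod
        cv::gpu::GpuMat src(host), dst;            // upload to the current CUDA device

        cv::gpu::FastNonLocalMeansDenoising denoiser;
        // h_luminance / h_color control filter strength; 21 and 7 are typical
        // search-window and block-window sizes for non-local means.
        denoiser.labMethod(src, dst, 20.0f, 10.0f, 21, 7, cv::gpu::Stream::Null());

        cv::imwrite("denoised.png", cv::Mat(dst)); // download and save the result
        return 0;
    }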
Some files were not shown because too many files have changed in this diff.