diff --git a/CMakeLists.txt b/CMakeLists.txt index 34ea764f8a..9b7f532ac5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,14 +110,15 @@ endif() # Optional 3rd party components # =================================================== -OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS AND NOT CARMA) ) OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) -OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) ) OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON) -OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS)) OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_GTK "Include GTK support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_IPP "Include Intel IPP support" OFF IF (MSVC OR X86 OR X86_64) ) @@ -139,9 +140,9 @@ OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) ) OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF) -OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) ) -OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) ) -OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) +OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) +OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) # OpenCV build components @@ -160,12 +161,12 @@ OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF I OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID ) # 3rd party libs -OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE ) -OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE ) -OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE ) -OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE ) -OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE ) -OCV_OPTION(BUILD_OPENEXR "Build openexr from source" 
WIN32 OR ANDROID OR APPLE ) +OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE OR CARMA ) +OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE OR CARMA ) # OpenCV installation options diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index c1cd83866b..d6d5f3a98a 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -3,17 +3,17 @@ if(${CMAKE_VERSION} VERSION_LESS "2.8.3") return() endif() -if (WIN32 AND NOT MSVC) +if(WIN32 AND NOT MSVC) message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler suppoted on your platform).") return() endif() -if (CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") +if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") message(STATUS "CUDA compilation is disabled (due to Clang unsuppoted on your platform).") return() endif() -find_package(CUDA 4.1) +find_package(CUDA 4.2) if(CUDA_FOUND) set(HAVE_CUDA 1) @@ -26,15 +26,20 @@ if(CUDA_FOUND) set(HAVE_CUBLAS 1) endif() - message(STATUS "CUDA detected: " ${CUDA_VERSION}) - - if(${CUDA_VERSION_STRING} VERSION_GREATER "4.1") - set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - else() - set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + if(WITH_NVCUVID) + find_cuda_helper_libs(nvcuvid) + set(HAVE_NVCUVID 1) endif() - set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + message(STATUS "CUDA detected: " ${CUDA_VERSION}) + + if (CARMA) + set(CUDA_ARCH_BIN "3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + else() + set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + endif() string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}") string(REGEX REPLACE "\\." 
"" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}") @@ -72,11 +77,20 @@ if(CUDA_FOUND) # Tell NVCC to add PTX intermediate code for the specified architectures string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}") - foreach(ARCH IN LISTS ARCH_LIST) - set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH}) - set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}") - set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}") - endforeach() + foreach(ARCH IN LISTS ARCH_LIST) + set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH}) + set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}") + set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}") + endforeach() + + if(CARMA) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM" ) + + if (CMAKE_VERSION VERSION_LESS 2.8.10) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -ccbin=${CMAKE_CXX_COMPILER}" ) + endif() + + endif() # These vars will be processed in other scripts set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA}) @@ -84,7 +98,7 @@ if(CUDA_FOUND) message(STATUS "CUDA NVCC target flags: ${CUDA_NVCC_FLAGS}") - OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF) + OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF) if(CUDA_FAST_MATH) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math) @@ -92,7 +106,6 @@ if(CUDA_FOUND) mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR) - unset(CUDA_npp_LIBRARY CACHE) find_cuda_helper_libs(npp) macro(ocv_cuda_compile VAR) @@ -106,15 +119,15 @@ if(CUDA_FOUND) string(REPLACE "-ggdb3" "" ${var} "${${var}}") endforeach() - if (BUILD_SHARED_LIBS) + if(BUILD_SHARED_LIBS) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -DCVAPI_EXPORTS) endif() if(UNIX OR APPLE) - set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC) endif() if(APPLE) - set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only) endif() # disabled because of multiple warnings during building nvcc auto generated files diff --git a/cmake/templates/cvconfig.h.cmake b/cmake/templates/cvconfig.h.cmake index 37e092b66e..6d4d2184f9 100644 --- a/cmake/templates/cvconfig.h.cmake +++ b/cmake/templates/cvconfig.h.cmake @@ -172,21 +172,15 @@ /* NVidia Cuda Runtime API*/ #cmakedefine HAVE_CUDA -/* OpenCL Support */ -#cmakedefine HAVE_OPENCL - -/* AMD's OpenCL Fast Fourier Transform Library*/ -#cmakedefine HAVE_CLAMDFFT - -/* AMD's Basic Linear Algebra Subprograms Library*/ -#cmakedefine HAVE_CLAMDBLAS - /* NVidia Cuda Fast Fourier Transform (FFT) API*/ #cmakedefine HAVE_CUFFT /* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/ #cmakedefine HAVE_CUBLAS +/* NVidia Video Decoding API*/ +#cmakedefine HAVE_NVCUVID + /* Compile for 'real' NVIDIA GPU architectures */ #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}" @@ -199,6 +193,15 @@ /* Create PTX or BIN for 1.0 compute capability */ #cmakedefine CUDA_ARCH_BIN_OR_PTX_10 +/* OpenCL Support */ +#cmakedefine HAVE_OPENCL + +/* AMD's OpenCL Fast Fourier Transform Library*/ +#cmakedefine HAVE_CLAMDFFT + +/* AMD's Basic Linear Algebra Subprograms Library*/ +#cmakedefine HAVE_CLAMDBLAS + /* VideoInput library */ #cmakedefine HAVE_VIDEOINPUT diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index cfa14cdcdb..4c5112e3f9 100644 --- 
a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -10,7 +10,6 @@ if(HAVE_CUDA) file(GLOB lib_cuda "src/cuda/*.cu") ocv_cuda_compile(cuda_objs ${lib_cuda}) - set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) else() set(lib_cuda "") diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp index 60e2096b29..5b8ee63790 100644 --- a/modules/core/include/opencv2/core/core.hpp +++ b/modules/core/include/opencv2/core/core.hpp @@ -91,7 +91,7 @@ class SparseMat; typedef Mat MatND; class GlBuffer; -class GlTexture; +class GlTexture2D; class GlArrays; class GlCamera; @@ -1306,7 +1306,7 @@ public: STD_VECTOR_MAT = 5 << KIND_SHIFT, EXPR = 6 << KIND_SHIFT, OPENGL_BUFFER = 7 << KIND_SHIFT, - OPENGL_TEXTURE = 8 << KIND_SHIFT, + OPENGL_TEXTURE2D = 8 << KIND_SHIFT, GPU_MAT = 9 << KIND_SHIFT }; _InputArray(); @@ -1323,13 +1323,13 @@ public: _InputArray(const Scalar& s); _InputArray(const double& val); _InputArray(const GlBuffer& buf); - _InputArray(const GlTexture& tex); + _InputArray(const GlTexture2D& tex); _InputArray(const gpu::GpuMat& d_mat); virtual Mat getMat(int i=-1) const; virtual void getMatVector(vector& mv) const; virtual GlBuffer getGlBuffer() const; - virtual GlTexture getGlTexture() const; + virtual GlTexture2D getGlTexture2D() const; virtual gpu::GpuMat getGpuMat() const; virtual int kind() const; @@ -1380,6 +1380,8 @@ public: template _OutputArray(Matx<_Tp, m, n>& matx); template _OutputArray(_Tp* vec, int n); _OutputArray(gpu::GpuMat& d_mat); + _OutputArray(GlBuffer& buf); + _OutputArray(GlTexture2D& tex); _OutputArray(const Mat& m); template _OutputArray(const vector<_Tp>& vec); @@ -1390,12 +1392,16 @@ public: template _OutputArray(const Matx<_Tp, m, n>& matx); template _OutputArray(const _Tp* vec, int n); _OutputArray(const gpu::GpuMat& d_mat); + _OutputArray(const GlBuffer& buf); + _OutputArray(const GlTexture2D& tex); virtual bool fixedSize() const; virtual bool fixedType() const; virtual bool needed() const; virtual Mat& getMatRef(int i=-1) const; virtual gpu::GpuMat& getGpuMatRef() const; + virtual GlBuffer& getGlBufferRef() const; + virtual GlTexture2D& getGlTexture2DRef() const; virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; diff --git a/modules/core/include/opencv2/core/cuda_devptrs.hpp b/modules/core/include/opencv2/core/cuda_devptrs.hpp index 6363e0dc45..26fc2403f9 100644 --- a/modules/core/include/opencv2/core/cuda_devptrs.hpp +++ b/modules/core/include/opencv2/core/cuda_devptrs.hpp @@ -152,6 +152,20 @@ namespace cv //#undef __CV_GPU_DEPR_BEFORE__ //#undef __CV_GPU_DEPR_AFTER__ + namespace device + { + using cv::gpu::PtrSz; + using cv::gpu::PtrStep; + using cv::gpu::PtrStepSz; + + using cv::gpu::PtrStepSzb; + using cv::gpu::PtrStepSzf; + using cv::gpu::PtrStepSzi; + + using cv::gpu::PtrStepb; + using cv::gpu::PtrStepf; + using cv::gpu::PtrStepi; + } } } diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index 2830a9e949..6bf4e5d21b 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -79,6 +79,8 @@ namespace cv { namespace gpu WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30 }; + CV_EXPORTS bool 
deviceSupports(FeatureSet feature_set); + // Gives information about what GPU archs this OpenCV GPU module was // compiled for class CV_EXPORTS TargetArchs @@ -545,22 +547,6 @@ namespace cv { namespace gpu ensureSizeIsEnough(size.height, size.width, type, m); } - inline void createContinuous(int rows, int cols, int type, GpuMat& m) - { - int area = rows * cols; - if (!m.isContinuous() || m.type() != type || m.size().area() != area) - ensureSizeIsEnough(1, area, type, m); - m = m.reshape(0, rows); - } - - inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) - { - if (m.type() == type && m.rows >= rows && m.cols >= cols) - m = m(Rect(0, 0, cols, rows)); - else - m.create(rows, cols, type); - } - inline GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) { if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols) diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp index 93e56c3ab3..c042ccaf1f 100644 --- a/modules/core/include/opencv2/core/internal.hpp +++ b/modules/core/include/opencv2/core/internal.hpp @@ -750,39 +750,4 @@ typedef struct CvBigFuncTable (tab).fn_2d[CV_32F] = (void*)FUNCNAME##_32f##FLAG; \ (tab).fn_2d[CV_64F] = (void*)FUNCNAME##_64f##FLAG -#ifdef __cplusplus -//! OpenGL extension table -class CV_EXPORTS CvOpenGlFuncTab -{ -public: - virtual ~CvOpenGlFuncTab(); - - virtual void genBuffers(int n, unsigned int* buffers) const = 0; - virtual void deleteBuffers(int n, const unsigned int* buffers) const = 0; - - virtual void bufferData(unsigned int target, ptrdiff_t size, const void* data, unsigned int usage) const = 0; - virtual void bufferSubData(unsigned int target, ptrdiff_t offset, ptrdiff_t size, const void* data) const = 0; - - virtual void bindBuffer(unsigned int target, unsigned int buffer) const = 0; - - virtual void* mapBuffer(unsigned int target, unsigned int access) const = 0; - virtual void unmapBuffer(unsigned int target) const = 0; - - virtual void generateBitmapFont(const std::string& family, int height, int weight, bool italic, bool underline, int start, int count, int base) const = 0; - - virtual bool isGlContextInitialized() const = 0; -}; - -CV_EXPORTS void icvSetOpenGlFuncTab(const CvOpenGlFuncTab* tab); - -CV_EXPORTS bool icvCheckGlError(const char* file, const int line, const char* func = ""); - -#if defined(__GNUC__) - #define CV_CheckGlError() CV_DbgAssert( (::icvCheckGlError(__FILE__, __LINE__, __func__)) ) -#else - #define CV_CheckGlError() CV_DbgAssert( (::icvCheckGlError(__FILE__, __LINE__)) ) -#endif - -#endif //__cplusplus - #endif // __OPENCV_CORE_INTERNAL_HPP__ diff --git a/modules/core/include/opencv2/core/opengl_interop.hpp b/modules/core/include/opencv2/core/opengl_interop.hpp index 0bd2e9fdcf..cfa84756c5 100644 --- a/modules/core/include/opencv2/core/opengl_interop.hpp +++ b/modules/core/include/opencv2/core/opengl_interop.hpp @@ -47,205 +47,212 @@ #include "opencv2/core/core.hpp" -namespace cv -{ +namespace cv { + +CV_EXPORTS bool checkGlError(const char* file, const int line, const char* func = ""); + +#if defined(__GNUC__) + #define CV_CheckGlError() CV_DbgAssert( (cv::checkGlError(__FILE__, __LINE__, __func__)) ) +#else + #define CV_CheckGlError() CV_DbgAssert( (cv::checkGlError(__FILE__, __LINE__)) ) +#endif + +/////////////////// OpenGL Objects /////////////////// + //! Smart pointer for OpenGL buffer memory with reference counting. 
 class CV_EXPORTS GlBuffer
 {
 public:
-    enum Usage
+    enum Target
     {
-        ARRAY_BUFFER = 0x8892,  // buffer will use for OpenGL arrays (vertices, colors, normals, etc)
-        TEXTURE_BUFFER = 0x88EC // buffer will ise for OpenGL textures
+        ARRAY_BUFFER = 0x8892,         //!< The buffer will be used as a source for vertex data
+        ELEMENT_ARRAY_BUFFER = 0x8893, //!< The buffer will be used for indices (in glDrawElements, for example)
+        PIXEL_PACK_BUFFER = 0x88EB,    //!< The buffer will be used for reading from OpenGL textures
+        PIXEL_UNPACK_BUFFER = 0x88EC   //!< The buffer will be used for writing to OpenGL textures
+    };
+
+    enum Access
+    {
+        READ_ONLY = 0x88B8,
+        WRITE_ONLY = 0x88B9,
+        READ_WRITE = 0x88BA
     };

     //! create empty buffer
-    explicit GlBuffer(Usage usage);
+    GlBuffer();
+
+    //! create buffer from existing buffer id
+    GlBuffer(int arows, int acols, int atype, unsigned int abufId, bool autoRelease = false);
+    GlBuffer(Size asize, int atype, unsigned int abufId, bool autoRelease = false);

     //! create buffer
-    GlBuffer(int rows, int cols, int type, Usage usage);
-    GlBuffer(Size size, int type, Usage usage);
+    GlBuffer(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+    GlBuffer(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);

     //! copy from host/device memory
-    GlBuffer(InputArray mat, Usage usage);
+    explicit GlBuffer(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);

-    void create(int rows, int cols, int type, Usage usage);
-    void create(Size size, int type, Usage usage);
-    void create(int rows, int cols, int type);
-    void create(Size size, int type);
+    //! create buffer
+    void create(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+    void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false) { create(asize.height, asize.width, atype, target, autoRelease); }

+    //! release memory and delete buffer object
     void release();

-    //! copy from host/device memory
-    void copyFrom(InputArray mat);
+    //! set auto release mode (if true, release will be called in object's destructor)
+    void setAutoRelease(bool flag);

-    void bind() const;
-    void unbind() const;
+    //! copy from host/device memory
+    void copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    //! copy to host/device memory
+    void copyTo(OutputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false) const;
+
+    //! create copy of current buffer
+    GlBuffer clone(Target target = ARRAY_BUFFER, bool autoRelease = false) const;
+
+    //! bind buffer for specified target
+    void bind(Target target) const;
+
+    //! unbind any buffers from specified target
+    static void unbind(Target target);

     //! map to host memory
-    Mat mapHost();
+    Mat mapHost(Access access);
     void unmapHost();

     //! map to device memory
     gpu::GpuMat mapDevice();
     void unmapDevice();

-    inline int rows() const { return rows_; }
-    inline int cols() const { return cols_; }
-    inline Size size() const { return Size(cols_, rows_); }
-    inline bool empty() const { return rows_ == 0 || cols_ == 0; }
+    int rows() const { return rows_; }
+    int cols() const { return cols_; }
+    Size size() const { return Size(cols_, rows_); }
+    bool empty() const { return rows_ == 0 || cols_ == 0; }

-    inline int type() const { return type_; }
-    inline int depth() const { return CV_MAT_DEPTH(type_); }
-    inline int channels() const { return CV_MAT_CN(type_); }
-    inline int elemSize() const { return CV_ELEM_SIZE(type_); }
-    inline int elemSize1() const { return CV_ELEM_SIZE1(type_); }
+    int type() const { return type_; }
+    int depth() const { return CV_MAT_DEPTH(type_); }
+    int channels() const { return CV_MAT_CN(type_); }
+    int elemSize() const { return CV_ELEM_SIZE(type_); }
+    int elemSize1() const { return CV_ELEM_SIZE1(type_); }

-    inline Usage usage() const { return usage_; }
+    unsigned int bufId() const;

     class Impl;
+
 private:
+    Ptr<Impl> impl_;
     int rows_;
     int cols_;
     int type_;
-    Usage usage_;
-
-    Ptr<Impl> impl_;
 };

 template <> CV_EXPORTS void Ptr<GlBuffer::Impl>::delete_obj();

-//! Smart pointer for OpenGL 2d texture memory with reference counting.
-class CV_EXPORTS GlTexture
+//! Smart pointer for OpenGL 2D texture memory with reference counting.
+class CV_EXPORTS GlTexture2D
 {
 public:
+    enum Format
+    {
+        NONE = 0,
+        DEPTH_COMPONENT = 0x1902, //!< Depth
+        RGB = 0x1907,             //!< Red, Green, Blue
+        RGBA = 0x1908             //!< Red, Green, Blue, Alpha
+    };
+
     //! create empty texture
-    GlTexture();
+    GlTexture2D();
+
+    //! create texture from existing texture id
+    GlTexture2D(int arows, int acols, Format aformat, unsigned int atexId, bool autoRelease = false);
+    GlTexture2D(Size asize, Format aformat, unsigned int atexId, bool autoRelease = false);

     //! create texture
-    GlTexture(int rows, int cols, int type);
-    GlTexture(Size size, int type);
+    GlTexture2D(int arows, int acols, Format aformat, bool autoRelease = false);
+    GlTexture2D(Size asize, Format aformat, bool autoRelease = false);

     //! copy from host/device memory
-    explicit GlTexture(InputArray mat, bool bgra = true);
+    explicit GlTexture2D(InputArray arr, bool autoRelease = false);

-    void create(int rows, int cols, int type);
-    void create(Size size, int type);
+    //! create texture
+    void create(int arows, int acols, Format aformat, bool autoRelease = false);
+    void create(Size asize, Format aformat, bool autoRelease = false) { create(asize.height, asize.width, aformat, autoRelease); }
+
+    //! release memory and delete texture object
     void release();

+    //! set auto release mode (if true, release will be called in object's destructor)
+    void setAutoRelease(bool flag);
+
     //! copy from host/device memory
-    void copyFrom(InputArray mat, bool bgra = true);
+    void copyFrom(InputArray arr, bool autoRelease = false);

+    //! copy to host/device memory
+    void copyTo(OutputArray arr, int ddepth = CV_32F, bool autoRelease = false) const;
+
+    //!
bind texture to current active texture unit for GL_TEXTURE_2D target void bind() const; - void unbind() const; - inline int rows() const { return rows_; } - inline int cols() const { return cols_; } - inline Size size() const { return Size(cols_, rows_); } - inline bool empty() const { return rows_ == 0 || cols_ == 0; } + int rows() const { return rows_; } + int cols() const { return cols_; } + Size size() const { return Size(cols_, rows_); } + bool empty() const { return rows_ == 0 || cols_ == 0; } - inline int type() const { return type_; } - inline int depth() const { return CV_MAT_DEPTH(type_); } - inline int channels() const { return CV_MAT_CN(type_); } - inline int elemSize() const { return CV_ELEM_SIZE(type_); } - inline int elemSize1() const { return CV_ELEM_SIZE1(type_); } + Format format() const { return format_; } + + unsigned int texId() const; class Impl; + private: + Ptr impl_; int rows_; int cols_; - int type_; - - Ptr impl_; - GlBuffer buf_; + Format format_; }; -template <> CV_EXPORTS void Ptr::delete_obj(); +template <> CV_EXPORTS void Ptr::delete_obj(); //! OpenGL Arrays class CV_EXPORTS GlArrays { public: - inline GlArrays() - : vertex_(GlBuffer::ARRAY_BUFFER), color_(GlBuffer::ARRAY_BUFFER), bgra_(true), normal_(GlBuffer::ARRAY_BUFFER), texCoord_(GlBuffer::ARRAY_BUFFER) - { - } + GlArrays(); void setVertexArray(InputArray vertex); - inline void resetVertexArray() { vertex_.release(); } + void resetVertexArray(); - void setColorArray(InputArray color, bool bgra = true); - inline void resetColorArray() { color_.release(); } + void setColorArray(InputArray color); + void resetColorArray(); void setNormalArray(InputArray normal); - inline void resetNormalArray() { normal_.release(); } + void resetNormalArray(); void setTexCoordArray(InputArray texCoord); - inline void resetTexCoordArray() { texCoord_.release(); } + void resetTexCoordArray(); + + void release(); + + void setAutoRelease(bool flag); void bind() const; - void unbind() const; - inline int rows() const { return vertex_.rows(); } - inline int cols() const { return vertex_.cols(); } - inline Size size() const { return vertex_.size(); } - inline bool empty() const { return vertex_.empty(); } + int size() const { return size_; } + bool empty() const { return size_ == 0; } private: + int size_; GlBuffer vertex_; GlBuffer color_; - bool bgra_; GlBuffer normal_; GlBuffer texCoord_; }; -//! OpenGL Font -class CV_EXPORTS GlFont -{ -public: - enum Weight - { - WEIGHT_LIGHT = 300, - WEIGHT_NORMAL = 400, - WEIGHT_SEMIBOLD = 600, - WEIGHT_BOLD = 700, - WEIGHT_BLACK = 900 - }; - - enum Style - { - STYLE_NORMAL = 0, - STYLE_ITALIC = 1, - STYLE_UNDERLINE = 2 - }; - - static Ptr get(const std::string& family, int height = 12, Weight weight = WEIGHT_NORMAL, Style style = STYLE_NORMAL); - - void draw(const char* str, size_t len) const; - - inline const std::string& family() const { return family_; } - inline int height() const { return height_; } - inline Weight weight() const { return weight_; } - inline Style style() const { return style_; } - -private: - GlFont(const std::string& family, int height, Weight weight, Style style); - - std::string family_; - int height_; - Weight weight_; - Style style_; - - unsigned int base_; - - GlFont(const GlFont&); - GlFont& operator =(const GlFont&); -}; - -//! render functions +/////////////////// Render Functions /////////////////// //! 
render texture rectangle in window -CV_EXPORTS void render(const GlTexture& tex, +CV_EXPORTS void render(const GlTexture2D& tex, Rect_ wndRect = Rect_(0.0, 0.0, 1.0, 1.0), Rect_ texRect = Rect_(0.0, 0.0, 1.0, 1.0)); @@ -267,67 +274,13 @@ namespace RenderMode { //! render OpenGL arrays CV_EXPORTS void render(const GlArrays& arr, int mode = RenderMode::POINTS, Scalar color = Scalar::all(255)); +CV_EXPORTS void render(const GlArrays& arr, InputArray indices, int mode = RenderMode::POINTS, Scalar color = Scalar::all(255)); -CV_EXPORTS void render(const std::string& str, const Ptr& font, Scalar color, Point2d pos); - -//! OpenGL camera -class CV_EXPORTS GlCamera -{ -public: - GlCamera(); - - void lookAt(Point3d eye, Point3d center, Point3d up); - void setCameraPos(Point3d pos, double yaw, double pitch, double roll); - - void setScale(Point3d scale); - - void setProjectionMatrix(const Mat& projectionMatrix, bool transpose = true); - void setPerspectiveProjection(double fov, double aspect, double zNear, double zFar); - void setOrthoProjection(double left, double right, double bottom, double top, double zNear, double zFar); - - void setupProjectionMatrix() const; - void setupModelViewMatrix() const; - -private: - Point3d eye_; - Point3d center_; - Point3d up_; - - Point3d pos_; - double yaw_; - double pitch_; - double roll_; - - bool useLookAtParams_; - - Point3d scale_; - - Mat projectionMatrix_; - - double fov_; - double aspect_; - - double left_; - double right_; - double bottom_; - double top_; - - double zNear_; - double zFar_; - - bool perspectiveProjection_; -}; - -inline void GlBuffer::create(Size _size, int _type, Usage _usage) { create(_size.height, _size.width, _type, _usage); } -inline void GlBuffer::create(int _rows, int _cols, int _type) { create(_rows, _cols, _type, usage()); } -inline void GlBuffer::create(Size _size, int _type) { create(_size.height, _size.width, _type, usage()); } -inline void GlTexture::create(Size _size, int _type) { create(_size.height, _size.width, _type); } - -namespace gpu -{ +namespace gpu { //! 
set a CUDA device to use OpenGL interoperability CV_EXPORTS void setGlDevice(int device = 0); } + } // namespace cv #endif // __cplusplus diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/core/src/cuda/matrix_operations.cu index 9e830e563b..60aa073406 100644 --- a/modules/core/src/cuda/matrix_operations.cu +++ b/modules/core/src/cuda/matrix_operations.cu @@ -44,6 +44,7 @@ #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/type_traits.hpp" namespace cv { namespace gpu { namespace device { @@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device void writeScalar(const int*); void writeScalar(const float*); void writeScalar(const double*); + void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t); }}} @@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device //////////////////////////////// ConvertTo //////////////////////////////// /////////////////////////////////////////////////////////////////////////// - template struct Convertor : unary_function + template struct Convertor : unary_function { - Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} + Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {} - __device__ __forceinline__ D operator()(const T& src) const + __device__ __forceinline__ D operator()(typename TypeTraits::ParameterType src) const { return saturate_cast(alpha * src + beta); } - double alpha, beta; + S alpha, beta; }; namespace detail @@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device }; } - template struct TransformFunctorTraits< Convertor > : detail::ConvertTraits< Convertor > + template struct TransformFunctorTraits< Convertor > : detail::ConvertTraits< Convertor > { }; - template + template void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream) { cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); cudaSafeCall( cudaSetDoubleForDevice(&beta) ); - Convertor op(alpha, beta); + Convertor op(static_cast(alpha), static_cast(beta)); cv::gpu::device::transform((PtrStepSz)src, (PtrStepSz)dst, op, WithOutMask(), stream); } @@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device { typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream); - static const caller_t tab[8][8] = + static const caller_t tab[7][7] = { - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {0,0,0,0,0,0,0,0} + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + } }; caller_t func = tab[sdepth][ddepth]; - if (!func) - cv::gpu::error("Unsupported convert operation", 
__FILE__, __LINE__, "convert_gpu"); - func(src, dst, alpha, beta, stream); } diff --git a/modules/core/src/gl_core_3_1.cpp b/modules/core/src/gl_core_3_1.cpp new file mode 100644 index 0000000000..3bc74faa19 --- /dev/null +++ b/modules/core/src/gl_core_3_1.cpp @@ -0,0 +1,2718 @@ +#include +#include +#include "cvconfig.h" +#include "opencv2/core/core.hpp" +#include "gl_core_3_1.hpp" + +#ifdef HAVE_OPENGL + #if defined(__APPLE__) + #include + + static void* AppleGLGetProcAddress (const char* name) + { + static const struct mach_header* image = 0; + if (!image) + image = NSAddImage("/System/Library/Frameworks/OpenGL.framework/Versions/Current/OpenGL", NSADDIMAGE_OPTION_RETURN_ON_ERROR); + + // prepend a '_' for the Unix C symbol mangling convention + std::string symbolName = "_"; + symbolName += std::string(name); + + NSSymbol symbol = image ? NSLookupSymbolInImage(image, &symbolName[0], NSLOOKUPSYMBOLINIMAGE_OPTION_BIND | NSLOOKUPSYMBOLINIMAGE_OPTION_RETURN_ON_ERROR) : 0; + + return symbol ? NSAddressOfSymbol(symbol) : 0; + } + #endif // __APPLE__ + + #if defined(__sgi) || defined (__sun) + #include + #include + + static void* SunGetProcAddress (const char* name) + { + typedef void* (func_t*)(const GLubyte*); + + static void* h = 0; + static func_t gpa = 0; + + if (!h) + { + h = dlopen(NULL, RTLD_LAZY | RTLD_LOCAL); + if (!h) + return 0; + gpa = (func_t) dlsym(h, "glXGetProcAddress"); + } + + return gpa ? gpa((const GLubyte*) name) : dlsym(h, name); + } + #endif // __sgi || __sun + + #if defined(_WIN32) + #ifdef _MSC_VER + #pragma warning(disable: 4055) + #pragma warning(disable: 4054) + #endif + + static int TestPointer(const PROC pTest) + { + if(!pTest) + return 0; + + ptrdiff_t iTest = (ptrdiff_t) pTest; + + if (iTest == 1 || iTest == 2 || iTest == 3 || iTest == -1) + return 0; + + return 1; + } + + static PROC WinGetProcAddress(const char* name) + { + PROC pFunc = wglGetProcAddress((LPCSTR) name); + if (TestPointer(pFunc)) + return pFunc; + + HMODULE glMod = GetModuleHandleA("OpenGL32.dll"); + return (PROC) GetProcAddress(glMod, (LPCSTR) name); + } + #endif // _WIN32 + + #if defined(_WIN32) + #define CV_GL_GET_PROC_ADDRESS(name) WinGetProcAddress(name) + #elif defined(__APPLE__) + #define CV_GL_GET_PROC_ADDRESS(name) AppleGLGetProcAddress(name) + #elif defined(__sgi) || defined(__sun) + #define CV_GL_GET_PROC_ADDRESS(name) SunGetProcAddress(name) + #else // GLX + #include + + #define CV_GL_GET_PROC_ADDRESS(name) glXGetProcAddressARB((const GLubyte*) name) + #endif + + static void* IntGetProcAddress(const char* name) + { + void* func = (void*) CV_GL_GET_PROC_ADDRESS(name); + if (!func) + { + std::ostringstream msg; + msg << "Can't load OpenGL extension [" << name << "]"; + CV_Error(CV_OpenGlApiCallError, msg.str()); + } + return func; + } +#else + static void* IntGetProcAddress(const char*) + { + CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support"); + return 0; + } +#endif + +namespace gl +{ + ////////////////////////////////////////////// + // Function pointer types + + // Extension: 1.1 + typedef void (CODEGEN_FUNCPTR *PFNCULLFACEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNFRONTFACEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNHINTPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNLINEWIDTHPROC)(GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNPOINTSIZEPROC)(GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNPOLYGONMODEPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNSCISSORPROC)(GLint , GLint , GLsizei , GLsizei ); + typedef 
void (CODEGEN_FUNCPTR *PFNTEXPARAMETERFPROC)(GLenum , GLenum , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERFVPROC)(GLenum , GLenum , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERIPROC)(GLenum , GLenum , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERIVPROC)(GLenum , GLenum , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNTEXIMAGE1DPROC)(GLenum , GLint , GLint , GLsizei , GLint , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXIMAGE2DPROC)(GLenum , GLint , GLint , GLsizei , GLsizei , GLint , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNDRAWBUFFERPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNCLEARPROC)(GLbitfield ); + typedef void (CODEGEN_FUNCPTR *PFNCLEARCOLORPROC)(GLfloat , GLfloat , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNCLEARSTENCILPROC)(GLint ); + typedef void (CODEGEN_FUNCPTR *PFNCLEARDEPTHPROC)(GLdouble ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILMASKPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNCOLORMASKPROC)(GLboolean , GLboolean , GLboolean , GLboolean ); + typedef void (CODEGEN_FUNCPTR *PFNDEPTHMASKPROC)(GLboolean ); + typedef void (CODEGEN_FUNCPTR *PFNDISABLEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNENABLEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNFINISHPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNFLUSHPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNBLENDFUNCPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNLOGICOPPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILFUNCPROC)(GLenum , GLint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILOPPROC)(GLenum , GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDEPTHFUNCPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNPIXELSTOREFPROC)(GLenum , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNPIXELSTOREIPROC)(GLenum , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNREADBUFFERPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNREADPIXELSPROC)(GLint , GLint , GLsizei , GLsizei , GLenum , GLenum , GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETBOOLEANVPROC)(GLenum , GLboolean *); + typedef void (CODEGEN_FUNCPTR *PFNGETDOUBLEVPROC)(GLenum , GLdouble *); + typedef GLenum (CODEGEN_FUNCPTR *PFNGETERRORPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNGETFLOATVPROC)(GLenum , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETINTEGERVPROC)(GLenum , GLint *); + typedef const GLubyte * (CODEGEN_FUNCPTR *PFNGETSTRINGPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXIMAGEPROC)(GLenum , GLint , GLenum , GLenum , GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXPARAMETERFVPROC)(GLenum , GLenum , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXPARAMETERIVPROC)(GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXLEVELPARAMETERFVPROC)(GLenum , GLint , GLenum , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXLEVELPARAMETERIVPROC)(GLenum , GLint , GLenum , GLint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISENABLEDPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDEPTHRANGEPROC)(GLdouble , GLdouble ); + typedef void (CODEGEN_FUNCPTR *PFNVIEWPORTPROC)(GLint , GLint , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWARRAYSPROC)(GLenum , GLint , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWELEMENTSPROC)(GLenum , GLsizei , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETPOINTERVPROC)(GLenum , GLvoid* *); + typedef void (CODEGEN_FUNCPTR *PFNPOLYGONOFFSETPROC)(GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR 
*PFNCOPYTEXIMAGE1DPROC)(GLenum , GLint , GLenum , GLint , GLint , GLsizei , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNCOPYTEXIMAGE2DPROC)(GLenum , GLint , GLenum , GLint , GLint , GLsizei , GLsizei , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNCOPYTEXSUBIMAGE1DPROC)(GLenum , GLint , GLint , GLint , GLint , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNCOPYTEXSUBIMAGE2DPROC)(GLenum , GLint , GLint , GLint , GLint , GLint , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNTEXSUBIMAGE1DPROC)(GLenum , GLint , GLint , GLsizei , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXSUBIMAGE2DPROC)(GLenum , GLint , GLint , GLint , GLsizei , GLsizei , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNBINDTEXTUREPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETETEXTURESPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENTEXTURESPROC)(GLsizei , GLuint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISTEXTUREPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNINDEXUBPROC)(GLubyte ); + typedef void (CODEGEN_FUNCPTR *PFNINDEXUBVPROC)(const GLubyte *); + + // Extension: 1.2 + typedef void (CODEGEN_FUNCPTR *PFNBLENDCOLORPROC)(GLfloat , GLfloat , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNBLENDEQUATIONPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWRANGEELEMENTSPROC)(GLenum , GLuint , GLuint , GLsizei , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXSUBIMAGE3DPROC)(GLenum , GLint , GLint , GLint , GLint , GLsizei , GLsizei , GLsizei , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOPYTEXSUBIMAGE3DPROC)(GLenum , GLint , GLint , GLint , GLint , GLint , GLint , GLsizei , GLsizei ); + + // Extension: 1.3 + typedef void (CODEGEN_FUNCPTR *PFNACTIVETEXTUREPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNSAMPLECOVERAGEPROC)(GLfloat , GLboolean ); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXIMAGE3DPROC)(GLenum , GLint , GLenum , GLsizei , GLsizei , GLsizei , GLint , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXIMAGE2DPROC)(GLenum , GLint , GLenum , GLsizei , GLsizei , GLint , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXIMAGE1DPROC)(GLenum , GLint , GLenum , GLsizei , GLint , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXSUBIMAGE3DPROC)(GLenum , GLint , GLint , GLint , GLint , GLsizei , GLsizei , GLsizei , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXSUBIMAGE2DPROC)(GLenum , GLint , GLint , GLint , GLsizei , GLsizei , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXSUBIMAGE1DPROC)(GLenum , GLint , GLint , GLsizei , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETCOMPRESSEDTEXIMAGEPROC)(GLenum , GLint , GLvoid *); + + // Extension: 1.4 + typedef void (CODEGEN_FUNCPTR *PFNBLENDFUNCSEPARATEPROC)(GLenum , GLenum , GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNMULTIDRAWARRAYSPROC)(GLenum , const GLint *, const GLsizei *, GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNMULTIDRAWELEMENTSPROC)(GLenum , const GLsizei *, GLenum , const GLvoid* const *, GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNPOINTPARAMETERFPROC)(GLenum , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNPOINTPARAMETERFVPROC)(GLenum , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNPOINTPARAMETERIPROC)(GLenum , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNPOINTPARAMETERIVPROC)(GLenum , 
const GLint *); + + // Extension: 1.5 + typedef void (CODEGEN_FUNCPTR *PFNGENQUERIESPROC)(GLsizei , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNDELETEQUERIESPROC)(GLsizei , const GLuint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISQUERYPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBEGINQUERYPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNENDQUERYPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNGETQUERYIVPROC)(GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETQUERYOBJECTIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETQUERYOBJECTUIVPROC)(GLuint , GLenum , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNBINDBUFFERPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETEBUFFERSPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENBUFFERSPROC)(GLsizei , GLuint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISBUFFERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBUFFERDATAPROC)(GLenum , GLsizeiptr , const GLvoid *, GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNBUFFERSUBDATAPROC)(GLenum , GLintptr , GLsizeiptr , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETBUFFERSUBDATAPROC)(GLenum , GLintptr , GLsizeiptr , GLvoid *); + typedef GLvoid* (CODEGEN_FUNCPTR *PFNMAPBUFFERPROC)(GLenum , GLenum ); + typedef GLboolean (CODEGEN_FUNCPTR *PFNUNMAPBUFFERPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNGETBUFFERPARAMETERIVPROC)(GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETBUFFERPOINTERVPROC)(GLenum , GLenum , GLvoid* *); + + // Extension: 2.0 + typedef void (CODEGEN_FUNCPTR *PFNBLENDEQUATIONSEPARATEPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWBUFFERSPROC)(GLsizei , const GLenum *); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILOPSEPARATEPROC)(GLenum , GLenum , GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILFUNCSEPARATEPROC)(GLenum , GLenum , GLint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILMASKSEPARATEPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNATTACHSHADERPROC)(GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBINDATTRIBLOCATIONPROC)(GLuint , GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPILESHADERPROC)(GLuint ); + typedef GLuint (CODEGEN_FUNCPTR *PFNCREATEPROGRAMPROC)(); + typedef GLuint (CODEGEN_FUNCPTR *PFNCREATESHADERPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDELETEPROGRAMPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETESHADERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDETACHSHADERPROC)(GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDISABLEVERTEXATTRIBARRAYPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNENABLEVERTEXATTRIBARRAYPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEATTRIBPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLint *, GLenum *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLint *, GLenum *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETATTACHEDSHADERSPROC)(GLuint , GLsizei , GLsizei *, GLuint *); + typedef GLint (CODEGEN_FUNCPTR *PFNGETATTRIBLOCATIONPROC)(GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETPROGRAMIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETPROGRAMINFOLOGPROC)(GLuint , GLsizei , GLsizei *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETSHADERIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETSHADERINFOLOGPROC)(GLuint , GLsizei , GLsizei *, GLchar *); + typedef void 
(CODEGEN_FUNCPTR *PFNGETSHADERSOURCEPROC)(GLuint , GLsizei , GLsizei *, GLchar *); + typedef GLint (CODEGEN_FUNCPTR *PFNGETUNIFORMLOCATIONPROC)(GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETUNIFORMFVPROC)(GLuint , GLint , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETUNIFORMIVPROC)(GLuint , GLint , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBDVPROC)(GLuint , GLenum , GLdouble *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBFVPROC)(GLuint , GLenum , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBPOINTERVPROC)(GLuint , GLenum , GLvoid* *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISPROGRAMPROC)(GLuint ); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISSHADERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNLINKPROGRAMPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNSHADERSOURCEPROC)(GLuint , GLsizei , const GLchar* const *, const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUSEPROGRAMPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1FPROC)(GLint , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2FPROC)(GLint , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3FPROC)(GLint , GLfloat , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4FPROC)(GLint , GLfloat , GLfloat , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1IPROC)(GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2IPROC)(GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3IPROC)(GLint , GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4IPROC)(GLint , GLint , GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1FVPROC)(GLint , GLsizei , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2FVPROC)(GLint , GLsizei , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3FVPROC)(GLint , GLsizei , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4FVPROC)(GLint , GLsizei , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1IVPROC)(GLint , GLsizei , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2IVPROC)(GLint , GLsizei , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3IVPROC)(GLint , GLsizei , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4IVPROC)(GLint , GLsizei , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX2FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX3FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX4FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNVALIDATEPROGRAMPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBPOINTERPROC)(GLuint , GLint , GLenum , GLboolean , GLsizei , const GLvoid *); + + // Extension: 2.1 + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX2X3FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX3X2FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX2X4FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX4X2FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX3X4FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR 
*PFNUNIFORMMATRIX4X3FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + + // Extension: ARB_vertex_array_object + typedef void (CODEGEN_FUNCPTR *PFNBINDVERTEXARRAYPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETEVERTEXARRAYSPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENVERTEXARRAYSPROC)(GLsizei , GLuint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISVERTEXARRAYPROC)(GLuint ); + + // Extension: ARB_map_buffer_range + typedef GLvoid* (CODEGEN_FUNCPTR *PFNMAPBUFFERRANGEPROC)(GLenum , GLintptr , GLsizeiptr , GLbitfield ); + typedef void (CODEGEN_FUNCPTR *PFNFLUSHMAPPEDBUFFERRANGEPROC)(GLenum , GLintptr , GLsizeiptr ); + + // Extension: ARB_framebuffer_object + typedef GLboolean (CODEGEN_FUNCPTR *PFNISRENDERBUFFERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBINDRENDERBUFFERPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETERENDERBUFFERSPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENRENDERBUFFERSPROC)(GLsizei , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNRENDERBUFFERSTORAGEPROC)(GLenum , GLenum , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNGETRENDERBUFFERPARAMETERIVPROC)(GLenum , GLenum , GLint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISFRAMEBUFFERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBINDFRAMEBUFFERPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETEFRAMEBUFFERSPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENFRAMEBUFFERSPROC)(GLsizei , GLuint *); + typedef GLenum (CODEGEN_FUNCPTR *PFNCHECKFRAMEBUFFERSTATUSPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERTEXTURE1DPROC)(GLenum , GLenum , GLenum , GLuint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERTEXTURE2DPROC)(GLenum , GLenum , GLenum , GLuint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERTEXTURE3DPROC)(GLenum , GLenum , GLenum , GLuint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERRENDERBUFFERPROC)(GLenum , GLenum , GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)(GLenum , GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGENERATEMIPMAPPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNBLITFRAMEBUFFERPROC)(GLint , GLint , GLint , GLint , GLint , GLint , GLint , GLint , GLbitfield , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNRENDERBUFFERSTORAGEMULTISAMPLEPROC)(GLenum , GLsizei , GLenum , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERTEXTURELAYERPROC)(GLenum , GLenum , GLuint , GLint , GLint ); + + // Extension: 3.0 + typedef void (CODEGEN_FUNCPTR *PFNCOLORMASKIPROC)(GLuint , GLboolean , GLboolean , GLboolean , GLboolean ); + typedef void (CODEGEN_FUNCPTR *PFNGETBOOLEANI_VPROC)(GLenum , GLuint , GLboolean *); + typedef void (CODEGEN_FUNCPTR *PFNGETINTEGERI_VPROC)(GLenum , GLuint , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNENABLEIPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDISABLEIPROC)(GLenum , GLuint ); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISENABLEDIPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBEGINTRANSFORMFEEDBACKPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNENDTRANSFORMFEEDBACKPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNBINDBUFFERRANGEPROC)(GLenum , GLuint , GLuint , GLintptr , GLsizeiptr ); + typedef void (CODEGEN_FUNCPTR *PFNBINDBUFFERBASEPROC)(GLenum , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNTRANSFORMFEEDBACKVARYINGSPROC)(GLuint , GLsizei , const GLchar* const *, GLenum ); + 
typedef void (CODEGEN_FUNCPTR *PFNGETTRANSFORMFEEDBACKVARYINGPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLsizei *, GLenum *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNCLAMPCOLORPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNBEGINCONDITIONALRENDERPROC)(GLuint , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNENDCONDITIONALRENDERPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBIPOINTERPROC)(GLuint , GLint , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBIIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBIUIVPROC)(GLuint , GLenum , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI1IPROC)(GLuint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI2IPROC)(GLuint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI3IPROC)(GLuint , GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4IPROC)(GLuint , GLint , GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI1UIPROC)(GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI2UIPROC)(GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI3UIPROC)(GLuint , GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4UIPROC)(GLuint , GLuint , GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI1IVPROC)(GLuint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI2IVPROC)(GLuint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI3IVPROC)(GLuint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4IVPROC)(GLuint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI1UIVPROC)(GLuint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI2UIVPROC)(GLuint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI3UIVPROC)(GLuint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4UIVPROC)(GLuint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4BVPROC)(GLuint , const GLbyte *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4SVPROC)(GLuint , const GLshort *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4UBVPROC)(GLuint , const GLubyte *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4USVPROC)(GLuint , const GLushort *); + typedef void (CODEGEN_FUNCPTR *PFNGETUNIFORMUIVPROC)(GLuint , GLint , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNBINDFRAGDATALOCATIONPROC)(GLuint , GLuint , const GLchar *); + typedef GLint (CODEGEN_FUNCPTR *PFNGETFRAGDATALOCATIONPROC)(GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1UIPROC)(GLint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2UIPROC)(GLint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3UIPROC)(GLint , GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4UIPROC)(GLint , GLuint , GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1UIVPROC)(GLint , GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2UIVPROC)(GLint , GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3UIVPROC)(GLint , GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4UIVPROC)(GLint , GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERIIVPROC)(GLenum , GLenum , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERIUIVPROC)(GLenum , GLenum , const GLuint *); + typedef void (CODEGEN_FUNCPTR 
*PFNGETTEXPARAMETERIIVPROC)(GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXPARAMETERIUIVPROC)(GLenum , GLenum , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNCLEARBUFFERIVPROC)(GLenum , GLint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNCLEARBUFFERUIVPROC)(GLenum , GLint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNCLEARBUFFERFVPROC)(GLenum , GLint , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNCLEARBUFFERFIPROC)(GLenum , GLint , GLfloat , GLint ); + typedef const GLubyte * (CODEGEN_FUNCPTR *PFNGETSTRINGIPROC)(GLenum , GLuint ); + + // Extension: ARB_uniform_buffer_object + typedef void (CODEGEN_FUNCPTR *PFNGETUNIFORMINDICESPROC)(GLuint , GLsizei , const GLchar* const *, GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMSIVPROC)(GLuint , GLsizei , const GLuint *, GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMNAMEPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLchar *); + typedef GLuint (CODEGEN_FUNCPTR *PFNGETUNIFORMBLOCKINDEXPROC)(GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMBLOCKIVPROC)(GLuint , GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMBLOCKNAMEPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMBLOCKBINDINGPROC)(GLuint , GLuint , GLuint ); + + // Extension: ARB_copy_buffer + typedef void (CODEGEN_FUNCPTR *PFNCOPYBUFFERSUBDATAPROC)(GLenum , GLenum , GLintptr , GLintptr , GLsizeiptr ); + + // Extension: 3.1 + typedef void (CODEGEN_FUNCPTR *PFNDRAWARRAYSINSTANCEDPROC)(GLenum , GLint , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWELEMENTSINSTANCEDPROC)(GLenum , GLsizei , GLenum , const GLvoid *, GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNTEXBUFFERPROC)(GLenum , GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNPRIMITIVERESTARTINDEXPROC)(GLuint ); + + // Legacy + typedef void (CODEGEN_FUNCPTR *PFNENABLECLIENTSTATEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDISABLECLIENTSTATEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXPOINTERPROC)(GLint , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNNORMALPOINTERPROC)(GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOLORPOINTERPROC)(GLint , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXCOORDPOINTERPROC)(GLint , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXENVIPROC)(GLenum , GLenum , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNMATRIXMODEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNLOADIDENTITYPROC)(void); + typedef void (CODEGEN_FUNCPTR *PFNORTHOPROC)(GLdouble , GLdouble , GLdouble , GLdouble , GLdouble , GLdouble ); + typedef void (CODEGEN_FUNCPTR *PFNCOLOR3DPROC)(GLdouble , GLdouble , GLdouble ); + + ////////////////////////////////////////////// + // Function pointers + + // Extension: 1.1 + PFNCULLFACEPROC CullFace; + PFNFRONTFACEPROC FrontFace; + PFNHINTPROC Hint; + PFNLINEWIDTHPROC LineWidth; + PFNPOINTSIZEPROC PointSize; + PFNPOLYGONMODEPROC PolygonMode; + PFNSCISSORPROC Scissor; + PFNTEXPARAMETERFPROC TexParameterf; + PFNTEXPARAMETERFVPROC TexParameterfv; + PFNTEXPARAMETERIPROC TexParameteri; + PFNTEXPARAMETERIVPROC TexParameteriv; + PFNTEXIMAGE1DPROC TexImage1D; + PFNTEXIMAGE2DPROC TexImage2D; + PFNDRAWBUFFERPROC DrawBuffer; + PFNCLEARPROC Clear; + PFNCLEARCOLORPROC ClearColor; + PFNCLEARSTENCILPROC ClearStencil; + PFNCLEARDEPTHPROC ClearDepth; + PFNSTENCILMASKPROC StencilMask; + 
PFNCOLORMASKPROC ColorMask; + PFNDEPTHMASKPROC DepthMask; + PFNDISABLEPROC Disable; + PFNENABLEPROC Enable; + PFNFINISHPROC Finish; + PFNFLUSHPROC Flush; + PFNBLENDFUNCPROC BlendFunc; + PFNLOGICOPPROC LogicOp; + PFNSTENCILFUNCPROC StencilFunc; + PFNSTENCILOPPROC StencilOp; + PFNDEPTHFUNCPROC DepthFunc; + PFNPIXELSTOREFPROC PixelStoref; + PFNPIXELSTOREIPROC PixelStorei; + PFNREADBUFFERPROC ReadBuffer; + PFNREADPIXELSPROC ReadPixels; + PFNGETBOOLEANVPROC GetBooleanv; + PFNGETDOUBLEVPROC GetDoublev; + PFNGETERRORPROC GetError; + PFNGETFLOATVPROC GetFloatv; + PFNGETINTEGERVPROC GetIntegerv; + PFNGETSTRINGPROC GetString; + PFNGETTEXIMAGEPROC GetTexImage; + PFNGETTEXPARAMETERFVPROC GetTexParameterfv; + PFNGETTEXPARAMETERIVPROC GetTexParameteriv; + PFNGETTEXLEVELPARAMETERFVPROC GetTexLevelParameterfv; + PFNGETTEXLEVELPARAMETERIVPROC GetTexLevelParameteriv; + PFNISENABLEDPROC IsEnabled; + PFNDEPTHRANGEPROC DepthRange; + PFNVIEWPORTPROC Viewport; + PFNDRAWARRAYSPROC DrawArrays; + PFNDRAWELEMENTSPROC DrawElements; + PFNGETPOINTERVPROC GetPointerv; + PFNPOLYGONOFFSETPROC PolygonOffset; + PFNCOPYTEXIMAGE1DPROC CopyTexImage1D; + PFNCOPYTEXIMAGE2DPROC CopyTexImage2D; + PFNCOPYTEXSUBIMAGE1DPROC CopyTexSubImage1D; + PFNCOPYTEXSUBIMAGE2DPROC CopyTexSubImage2D; + PFNTEXSUBIMAGE1DPROC TexSubImage1D; + PFNTEXSUBIMAGE2DPROC TexSubImage2D; + PFNBINDTEXTUREPROC BindTexture; + PFNDELETETEXTURESPROC DeleteTextures; + PFNGENTEXTURESPROC GenTextures; + PFNISTEXTUREPROC IsTexture; + PFNINDEXUBPROC Indexub; + PFNINDEXUBVPROC Indexubv; + + // Extension: 1.2 + PFNBLENDCOLORPROC BlendColor; + PFNBLENDEQUATIONPROC BlendEquation; + PFNDRAWRANGEELEMENTSPROC DrawRangeElements; + PFNTEXSUBIMAGE3DPROC TexSubImage3D; + PFNCOPYTEXSUBIMAGE3DPROC CopyTexSubImage3D; + + // Extension: 1.3 + PFNACTIVETEXTUREPROC ActiveTexture; + PFNSAMPLECOVERAGEPROC SampleCoverage; + PFNCOMPRESSEDTEXIMAGE3DPROC CompressedTexImage3D; + PFNCOMPRESSEDTEXIMAGE2DPROC CompressedTexImage2D; + PFNCOMPRESSEDTEXIMAGE1DPROC CompressedTexImage1D; + PFNCOMPRESSEDTEXSUBIMAGE3DPROC CompressedTexSubImage3D; + PFNCOMPRESSEDTEXSUBIMAGE2DPROC CompressedTexSubImage2D; + PFNCOMPRESSEDTEXSUBIMAGE1DPROC CompressedTexSubImage1D; + PFNGETCOMPRESSEDTEXIMAGEPROC GetCompressedTexImage; + + // Extension: 1.4 + PFNBLENDFUNCSEPARATEPROC BlendFuncSeparate; + PFNMULTIDRAWARRAYSPROC MultiDrawArrays; + PFNMULTIDRAWELEMENTSPROC MultiDrawElements; + PFNPOINTPARAMETERFPROC PointParameterf; + PFNPOINTPARAMETERFVPROC PointParameterfv; + PFNPOINTPARAMETERIPROC PointParameteri; + PFNPOINTPARAMETERIVPROC PointParameteriv; + + // Extension: 1.5 + PFNGENQUERIESPROC GenQueries; + PFNDELETEQUERIESPROC DeleteQueries; + PFNISQUERYPROC IsQuery; + PFNBEGINQUERYPROC BeginQuery; + PFNENDQUERYPROC EndQuery; + PFNGETQUERYIVPROC GetQueryiv; + PFNGETQUERYOBJECTIVPROC GetQueryObjectiv; + PFNGETQUERYOBJECTUIVPROC GetQueryObjectuiv; + PFNBINDBUFFERPROC BindBuffer; + PFNDELETEBUFFERSPROC DeleteBuffers; + PFNGENBUFFERSPROC GenBuffers; + PFNISBUFFERPROC IsBuffer; + PFNBUFFERDATAPROC BufferData; + PFNBUFFERSUBDATAPROC BufferSubData; + PFNGETBUFFERSUBDATAPROC GetBufferSubData; + PFNMAPBUFFERPROC MapBuffer; + PFNUNMAPBUFFERPROC UnmapBuffer; + PFNGETBUFFERPARAMETERIVPROC GetBufferParameteriv; + PFNGETBUFFERPOINTERVPROC GetBufferPointerv; + + // Extension: 2.0 + PFNBLENDEQUATIONSEPARATEPROC BlendEquationSeparate; + PFNDRAWBUFFERSPROC DrawBuffers; + PFNSTENCILOPSEPARATEPROC StencilOpSeparate; + PFNSTENCILFUNCSEPARATEPROC StencilFuncSeparate; + PFNSTENCILMASKSEPARATEPROC StencilMaskSeparate; + PFNATTACHSHADERPROC 
AttachShader; + PFNBINDATTRIBLOCATIONPROC BindAttribLocation; + PFNCOMPILESHADERPROC CompileShader; + PFNCREATEPROGRAMPROC CreateProgram; + PFNCREATESHADERPROC CreateShader; + PFNDELETEPROGRAMPROC DeleteProgram; + PFNDELETESHADERPROC DeleteShader; + PFNDETACHSHADERPROC DetachShader; + PFNDISABLEVERTEXATTRIBARRAYPROC DisableVertexAttribArray; + PFNENABLEVERTEXATTRIBARRAYPROC EnableVertexAttribArray; + PFNGETACTIVEATTRIBPROC GetActiveAttrib; + PFNGETACTIVEUNIFORMPROC GetActiveUniform; + PFNGETATTACHEDSHADERSPROC GetAttachedShaders; + PFNGETATTRIBLOCATIONPROC GetAttribLocation; + PFNGETPROGRAMIVPROC GetProgramiv; + PFNGETPROGRAMINFOLOGPROC GetProgramInfoLog; + PFNGETSHADERIVPROC GetShaderiv; + PFNGETSHADERINFOLOGPROC GetShaderInfoLog; + PFNGETSHADERSOURCEPROC GetShaderSource; + PFNGETUNIFORMLOCATIONPROC GetUniformLocation; + PFNGETUNIFORMFVPROC GetUniformfv; + PFNGETUNIFORMIVPROC GetUniformiv; + PFNGETVERTEXATTRIBDVPROC GetVertexAttribdv; + PFNGETVERTEXATTRIBFVPROC GetVertexAttribfv; + PFNGETVERTEXATTRIBIVPROC GetVertexAttribiv; + PFNGETVERTEXATTRIBPOINTERVPROC GetVertexAttribPointerv; + PFNISPROGRAMPROC IsProgram; + PFNISSHADERPROC IsShader; + PFNLINKPROGRAMPROC LinkProgram; + PFNSHADERSOURCEPROC ShaderSource; + PFNUSEPROGRAMPROC UseProgram; + PFNUNIFORM1FPROC Uniform1f; + PFNUNIFORM2FPROC Uniform2f; + PFNUNIFORM3FPROC Uniform3f; + PFNUNIFORM4FPROC Uniform4f; + PFNUNIFORM1IPROC Uniform1i; + PFNUNIFORM2IPROC Uniform2i; + PFNUNIFORM3IPROC Uniform3i; + PFNUNIFORM4IPROC Uniform4i; + PFNUNIFORM1FVPROC Uniform1fv; + PFNUNIFORM2FVPROC Uniform2fv; + PFNUNIFORM3FVPROC Uniform3fv; + PFNUNIFORM4FVPROC Uniform4fv; + PFNUNIFORM1IVPROC Uniform1iv; + PFNUNIFORM2IVPROC Uniform2iv; + PFNUNIFORM3IVPROC Uniform3iv; + PFNUNIFORM4IVPROC Uniform4iv; + PFNUNIFORMMATRIX2FVPROC UniformMatrix2fv; + PFNUNIFORMMATRIX3FVPROC UniformMatrix3fv; + PFNUNIFORMMATRIX4FVPROC UniformMatrix4fv; + PFNVALIDATEPROGRAMPROC ValidateProgram; + PFNVERTEXATTRIBPOINTERPROC VertexAttribPointer; + + // Extension: 2.1 + PFNUNIFORMMATRIX2X3FVPROC UniformMatrix2x3fv; + PFNUNIFORMMATRIX3X2FVPROC UniformMatrix3x2fv; + PFNUNIFORMMATRIX2X4FVPROC UniformMatrix2x4fv; + PFNUNIFORMMATRIX4X2FVPROC UniformMatrix4x2fv; + PFNUNIFORMMATRIX3X4FVPROC UniformMatrix3x4fv; + PFNUNIFORMMATRIX4X3FVPROC UniformMatrix4x3fv; + + // Extension: ARB_vertex_array_object + PFNBINDVERTEXARRAYPROC BindVertexArray; + PFNDELETEVERTEXARRAYSPROC DeleteVertexArrays; + PFNGENVERTEXARRAYSPROC GenVertexArrays; + PFNISVERTEXARRAYPROC IsVertexArray; + + // Extension: ARB_map_buffer_range + PFNMAPBUFFERRANGEPROC MapBufferRange; + PFNFLUSHMAPPEDBUFFERRANGEPROC FlushMappedBufferRange; + + // Extension: ARB_framebuffer_object + PFNISRENDERBUFFERPROC IsRenderbuffer; + PFNBINDRENDERBUFFERPROC BindRenderbuffer; + PFNDELETERENDERBUFFERSPROC DeleteRenderbuffers; + PFNGENRENDERBUFFERSPROC GenRenderbuffers; + PFNRENDERBUFFERSTORAGEPROC RenderbufferStorage; + PFNGETRENDERBUFFERPARAMETERIVPROC GetRenderbufferParameteriv; + PFNISFRAMEBUFFERPROC IsFramebuffer; + PFNBINDFRAMEBUFFERPROC BindFramebuffer; + PFNDELETEFRAMEBUFFERSPROC DeleteFramebuffers; + PFNGENFRAMEBUFFERSPROC GenFramebuffers; + PFNCHECKFRAMEBUFFERSTATUSPROC CheckFramebufferStatus; + PFNFRAMEBUFFERTEXTURE1DPROC FramebufferTexture1D; + PFNFRAMEBUFFERTEXTURE2DPROC FramebufferTexture2D; + PFNFRAMEBUFFERTEXTURE3DPROC FramebufferTexture3D; + PFNFRAMEBUFFERRENDERBUFFERPROC FramebufferRenderbuffer; + PFNGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC GetFramebufferAttachmentParameteriv; + PFNGENERATEMIPMAPPROC GenerateMipmap; + 
PFNBLITFRAMEBUFFERPROC BlitFramebuffer; + PFNRENDERBUFFERSTORAGEMULTISAMPLEPROC RenderbufferStorageMultisample; + PFNFRAMEBUFFERTEXTURELAYERPROC FramebufferTextureLayer; + + // Extension: 3.0 + PFNCOLORMASKIPROC ColorMaski; + PFNGETBOOLEANI_VPROC GetBooleani_v; + PFNGETINTEGERI_VPROC GetIntegeri_v; + PFNENABLEIPROC Enablei; + PFNDISABLEIPROC Disablei; + PFNISENABLEDIPROC IsEnabledi; + PFNBEGINTRANSFORMFEEDBACKPROC BeginTransformFeedback; + PFNENDTRANSFORMFEEDBACKPROC EndTransformFeedback; + PFNBINDBUFFERRANGEPROC BindBufferRange; + PFNBINDBUFFERBASEPROC BindBufferBase; + PFNTRANSFORMFEEDBACKVARYINGSPROC TransformFeedbackVaryings; + PFNGETTRANSFORMFEEDBACKVARYINGPROC GetTransformFeedbackVarying; + PFNCLAMPCOLORPROC ClampColor; + PFNBEGINCONDITIONALRENDERPROC BeginConditionalRender; + PFNENDCONDITIONALRENDERPROC EndConditionalRender; + PFNVERTEXATTRIBIPOINTERPROC VertexAttribIPointer; + PFNGETVERTEXATTRIBIIVPROC GetVertexAttribIiv; + PFNGETVERTEXATTRIBIUIVPROC GetVertexAttribIuiv; + PFNVERTEXATTRIBI1IPROC VertexAttribI1i; + PFNVERTEXATTRIBI2IPROC VertexAttribI2i; + PFNVERTEXATTRIBI3IPROC VertexAttribI3i; + PFNVERTEXATTRIBI4IPROC VertexAttribI4i; + PFNVERTEXATTRIBI1UIPROC VertexAttribI1ui; + PFNVERTEXATTRIBI2UIPROC VertexAttribI2ui; + PFNVERTEXATTRIBI3UIPROC VertexAttribI3ui; + PFNVERTEXATTRIBI4UIPROC VertexAttribI4ui; + PFNVERTEXATTRIBI1IVPROC VertexAttribI1iv; + PFNVERTEXATTRIBI2IVPROC VertexAttribI2iv; + PFNVERTEXATTRIBI3IVPROC VertexAttribI3iv; + PFNVERTEXATTRIBI4IVPROC VertexAttribI4iv; + PFNVERTEXATTRIBI1UIVPROC VertexAttribI1uiv; + PFNVERTEXATTRIBI2UIVPROC VertexAttribI2uiv; + PFNVERTEXATTRIBI3UIVPROC VertexAttribI3uiv; + PFNVERTEXATTRIBI4UIVPROC VertexAttribI4uiv; + PFNVERTEXATTRIBI4BVPROC VertexAttribI4bv; + PFNVERTEXATTRIBI4SVPROC VertexAttribI4sv; + PFNVERTEXATTRIBI4UBVPROC VertexAttribI4ubv; + PFNVERTEXATTRIBI4USVPROC VertexAttribI4usv; + PFNGETUNIFORMUIVPROC GetUniformuiv; + PFNBINDFRAGDATALOCATIONPROC BindFragDataLocation; + PFNGETFRAGDATALOCATIONPROC GetFragDataLocation; + PFNUNIFORM1UIPROC Uniform1ui; + PFNUNIFORM2UIPROC Uniform2ui; + PFNUNIFORM3UIPROC Uniform3ui; + PFNUNIFORM4UIPROC Uniform4ui; + PFNUNIFORM1UIVPROC Uniform1uiv; + PFNUNIFORM2UIVPROC Uniform2uiv; + PFNUNIFORM3UIVPROC Uniform3uiv; + PFNUNIFORM4UIVPROC Uniform4uiv; + PFNTEXPARAMETERIIVPROC TexParameterIiv; + PFNTEXPARAMETERIUIVPROC TexParameterIuiv; + PFNGETTEXPARAMETERIIVPROC GetTexParameterIiv; + PFNGETTEXPARAMETERIUIVPROC GetTexParameterIuiv; + PFNCLEARBUFFERIVPROC ClearBufferiv; + PFNCLEARBUFFERUIVPROC ClearBufferuiv; + PFNCLEARBUFFERFVPROC ClearBufferfv; + PFNCLEARBUFFERFIPROC ClearBufferfi; + PFNGETSTRINGIPROC GetStringi; + + // Extension: ARB_uniform_buffer_object + PFNGETUNIFORMINDICESPROC GetUniformIndices; + PFNGETACTIVEUNIFORMSIVPROC GetActiveUniformsiv; + PFNGETACTIVEUNIFORMNAMEPROC GetActiveUniformName; + PFNGETUNIFORMBLOCKINDEXPROC GetUniformBlockIndex; + PFNGETACTIVEUNIFORMBLOCKIVPROC GetActiveUniformBlockiv; + PFNGETACTIVEUNIFORMBLOCKNAMEPROC GetActiveUniformBlockName; + PFNUNIFORMBLOCKBINDINGPROC UniformBlockBinding; + + // Extension: ARB_copy_buffer + PFNCOPYBUFFERSUBDATAPROC CopyBufferSubData; + + // Extension: 3.1 + PFNDRAWARRAYSINSTANCEDPROC DrawArraysInstanced; + PFNDRAWELEMENTSINSTANCEDPROC DrawElementsInstanced; + PFNTEXBUFFERPROC TexBuffer; + PFNPRIMITIVERESTARTINDEXPROC PrimitiveRestartIndex; + + // Legacy + PFNENABLECLIENTSTATEPROC EnableClientState; + PFNDISABLECLIENTSTATEPROC DisableClientState; + PFNVERTEXPOINTERPROC VertexPointer; + PFNNORMALPOINTERPROC NormalPointer; + 
PFNCOLORPOINTERPROC ColorPointer; + PFNTEXCOORDPOINTERPROC TexCoordPointer; + + PFNTEXENVIPROC TexEnvi; + + PFNMATRIXMODEPROC MatrixMode; + PFNLOADIDENTITYPROC LoadIdentity; + PFNORTHOPROC Ortho; + + PFNCOLOR3DPROC Color3d; + + ////////////////////////////////////////////// + // Switch functions + + // Extension: 1.1 + + static void CODEGEN_FUNCPTR Switch_CullFace(GLenum mode) + { + CullFace = (PFNCULLFACEPROC)IntGetProcAddress("glCullFace"); + CullFace(mode); + } + + static void CODEGEN_FUNCPTR Switch_FrontFace(GLenum mode) + { + FrontFace = (PFNFRONTFACEPROC)IntGetProcAddress("glFrontFace"); + FrontFace(mode); + } + + static void CODEGEN_FUNCPTR Switch_Hint(GLenum target, GLenum mode) + { + Hint = (PFNHINTPROC)IntGetProcAddress("glHint"); + Hint(target, mode); + } + + static void CODEGEN_FUNCPTR Switch_LineWidth(GLfloat width) + { + LineWidth = (PFNLINEWIDTHPROC)IntGetProcAddress("glLineWidth"); + LineWidth(width); + } + + static void CODEGEN_FUNCPTR Switch_PointSize(GLfloat size) + { + PointSize = (PFNPOINTSIZEPROC)IntGetProcAddress("glPointSize"); + PointSize(size); + } + + static void CODEGEN_FUNCPTR Switch_PolygonMode(GLenum face, GLenum mode) + { + PolygonMode = (PFNPOLYGONMODEPROC)IntGetProcAddress("glPolygonMode"); + PolygonMode(face, mode); + } + + static void CODEGEN_FUNCPTR Switch_Scissor(GLint x, GLint y, GLsizei width, GLsizei height) + { + Scissor = (PFNSCISSORPROC)IntGetProcAddress("glScissor"); + Scissor(x, y, width, height); + } + + static void CODEGEN_FUNCPTR Switch_TexParameterf(GLenum target, GLenum pname, GLfloat param) + { + TexParameterf = (PFNTEXPARAMETERFPROC)IntGetProcAddress("glTexParameterf"); + TexParameterf(target, pname, param); + } + + static void CODEGEN_FUNCPTR Switch_TexParameterfv(GLenum target, GLenum pname, const GLfloat *params) + { + TexParameterfv = (PFNTEXPARAMETERFVPROC)IntGetProcAddress("glTexParameterfv"); + TexParameterfv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_TexParameteri(GLenum target, GLenum pname, GLint param) + { + TexParameteri = (PFNTEXPARAMETERIPROC)IntGetProcAddress("glTexParameteri"); + TexParameteri(target, pname, param); + } + + static void CODEGEN_FUNCPTR Switch_TexParameteriv(GLenum target, GLenum pname, const GLint *params) + { + TexParameteriv = (PFNTEXPARAMETERIVPROC)IntGetProcAddress("glTexParameteriv"); + TexParameteriv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_TexImage1D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const GLvoid *pixels) + { + TexImage1D = (PFNTEXIMAGE1DPROC)IntGetProcAddress("glTexImage1D"); + TexImage1D(target, level, internalformat, width, border, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_TexImage2D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels) + { + TexImage2D = (PFNTEXIMAGE2DPROC)IntGetProcAddress("glTexImage2D"); + TexImage2D(target, level, internalformat, width, height, border, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_DrawBuffer(GLenum mode) + { + DrawBuffer = (PFNDRAWBUFFERPROC)IntGetProcAddress("glDrawBuffer"); + DrawBuffer(mode); + } + + static void CODEGEN_FUNCPTR Switch_Clear(GLbitfield mask) + { + Clear = (PFNCLEARPROC)IntGetProcAddress("glClear"); + Clear(mask); + } + + static void CODEGEN_FUNCPTR Switch_ClearColor(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha) + { + ClearColor = 
(PFNCLEARCOLORPROC)IntGetProcAddress("glClearColor"); + ClearColor(red, green, blue, alpha); + } + + static void CODEGEN_FUNCPTR Switch_ClearStencil(GLint s) + { + ClearStencil = (PFNCLEARSTENCILPROC)IntGetProcAddress("glClearStencil"); + ClearStencil(s); + } + + static void CODEGEN_FUNCPTR Switch_ClearDepth(GLdouble depth) + { + ClearDepth = (PFNCLEARDEPTHPROC)IntGetProcAddress("glClearDepth"); + ClearDepth(depth); + } + + static void CODEGEN_FUNCPTR Switch_StencilMask(GLuint mask) + { + StencilMask = (PFNSTENCILMASKPROC)IntGetProcAddress("glStencilMask"); + StencilMask(mask); + } + + static void CODEGEN_FUNCPTR Switch_ColorMask(GLboolean red, GLboolean green, GLboolean blue, GLboolean alpha) + { + ColorMask = (PFNCOLORMASKPROC)IntGetProcAddress("glColorMask"); + ColorMask(red, green, blue, alpha); + } + + static void CODEGEN_FUNCPTR Switch_DepthMask(GLboolean flag) + { + DepthMask = (PFNDEPTHMASKPROC)IntGetProcAddress("glDepthMask"); + DepthMask(flag); + } + + static void CODEGEN_FUNCPTR Switch_Disable(GLenum cap) + { + Disable = (PFNDISABLEPROC)IntGetProcAddress("glDisable"); + Disable(cap); + } + + static void CODEGEN_FUNCPTR Switch_Enable(GLenum cap) + { + Enable = (PFNENABLEPROC)IntGetProcAddress("glEnable"); + Enable(cap); + } + + static void CODEGEN_FUNCPTR Switch_Finish() + { + Finish = (PFNFINISHPROC)IntGetProcAddress("glFinish"); + Finish(); + } + + static void CODEGEN_FUNCPTR Switch_Flush() + { + Flush = (PFNFLUSHPROC)IntGetProcAddress("glFlush"); + Flush(); + } + + static void CODEGEN_FUNCPTR Switch_BlendFunc(GLenum sfactor, GLenum dfactor) + { + BlendFunc = (PFNBLENDFUNCPROC)IntGetProcAddress("glBlendFunc"); + BlendFunc(sfactor, dfactor); + } + + static void CODEGEN_FUNCPTR Switch_LogicOp(GLenum opcode) + { + LogicOp = (PFNLOGICOPPROC)IntGetProcAddress("glLogicOp"); + LogicOp(opcode); + } + + static void CODEGEN_FUNCPTR Switch_StencilFunc(GLenum func, GLint ref, GLuint mask) + { + StencilFunc = (PFNSTENCILFUNCPROC)IntGetProcAddress("glStencilFunc"); + StencilFunc(func, ref, mask); + } + + static void CODEGEN_FUNCPTR Switch_StencilOp(GLenum fail, GLenum zfail, GLenum zpass) + { + StencilOp = (PFNSTENCILOPPROC)IntGetProcAddress("glStencilOp"); + StencilOp(fail, zfail, zpass); + } + + static void CODEGEN_FUNCPTR Switch_DepthFunc(GLenum func) + { + DepthFunc = (PFNDEPTHFUNCPROC)IntGetProcAddress("glDepthFunc"); + DepthFunc(func); + } + + static void CODEGEN_FUNCPTR Switch_PixelStoref(GLenum pname, GLfloat param) + { + PixelStoref = (PFNPIXELSTOREFPROC)IntGetProcAddress("glPixelStoref"); + PixelStoref(pname, param); + } + + static void CODEGEN_FUNCPTR Switch_PixelStorei(GLenum pname, GLint param) + { + PixelStorei = (PFNPIXELSTOREIPROC)IntGetProcAddress("glPixelStorei"); + PixelStorei(pname, param); + } + + static void CODEGEN_FUNCPTR Switch_ReadBuffer(GLenum mode) + { + ReadBuffer = (PFNREADBUFFERPROC)IntGetProcAddress("glReadBuffer"); + ReadBuffer(mode); + } + + static void CODEGEN_FUNCPTR Switch_ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLvoid *pixels) + { + ReadPixels = (PFNREADPIXELSPROC)IntGetProcAddress("glReadPixels"); + ReadPixels(x, y, width, height, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_GetBooleanv(GLenum pname, GLboolean *params) + { + GetBooleanv = (PFNGETBOOLEANVPROC)IntGetProcAddress("glGetBooleanv"); + GetBooleanv(pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetDoublev(GLenum pname, GLdouble *params) + { + GetDoublev = (PFNGETDOUBLEVPROC)IntGetProcAddress("glGetDoublev"); + 
GetDoublev(pname, params); + } + + static GLenum CODEGEN_FUNCPTR Switch_GetError() + { + GetError = (PFNGETERRORPROC)IntGetProcAddress("glGetError"); + return GetError(); + } + + static void CODEGEN_FUNCPTR Switch_GetFloatv(GLenum pname, GLfloat *params) + { + GetFloatv = (PFNGETFLOATVPROC)IntGetProcAddress("glGetFloatv"); + GetFloatv(pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetIntegerv(GLenum pname, GLint *params) + { + GetIntegerv = (PFNGETINTEGERVPROC)IntGetProcAddress("glGetIntegerv"); + GetIntegerv(pname, params); + } + + static const GLubyte * CODEGEN_FUNCPTR Switch_GetString(GLenum name) + { + GetString = (PFNGETSTRINGPROC)IntGetProcAddress("glGetString"); + return GetString(name); + } + + static void CODEGEN_FUNCPTR Switch_GetTexImage(GLenum target, GLint level, GLenum format, GLenum type, GLvoid *pixels) + { + GetTexImage = (PFNGETTEXIMAGEPROC)IntGetProcAddress("glGetTexImage"); + GetTexImage(target, level, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_GetTexParameterfv(GLenum target, GLenum pname, GLfloat *params) + { + GetTexParameterfv = (PFNGETTEXPARAMETERFVPROC)IntGetProcAddress("glGetTexParameterfv"); + GetTexParameterfv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexParameteriv(GLenum target, GLenum pname, GLint *params) + { + GetTexParameteriv = (PFNGETTEXPARAMETERIVPROC)IntGetProcAddress("glGetTexParameteriv"); + GetTexParameteriv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexLevelParameterfv(GLenum target, GLint level, GLenum pname, GLfloat *params) + { + GetTexLevelParameterfv = (PFNGETTEXLEVELPARAMETERFVPROC)IntGetProcAddress("glGetTexLevelParameterfv"); + GetTexLevelParameterfv(target, level, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexLevelParameteriv(GLenum target, GLint level, GLenum pname, GLint *params) + { + GetTexLevelParameteriv = (PFNGETTEXLEVELPARAMETERIVPROC)IntGetProcAddress("glGetTexLevelParameteriv"); + GetTexLevelParameteriv(target, level, pname, params); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsEnabled(GLenum cap) + { + IsEnabled = (PFNISENABLEDPROC)IntGetProcAddress("glIsEnabled"); + return IsEnabled(cap); + } + + static void CODEGEN_FUNCPTR Switch_DepthRange(GLdouble ren_near, GLdouble ren_far) + { + DepthRange = (PFNDEPTHRANGEPROC)IntGetProcAddress("glDepthRange"); + DepthRange(ren_near, ren_far); + } + + static void CODEGEN_FUNCPTR Switch_Viewport(GLint x, GLint y, GLsizei width, GLsizei height) + { + Viewport = (PFNVIEWPORTPROC)IntGetProcAddress("glViewport"); + Viewport(x, y, width, height); + } + + static void CODEGEN_FUNCPTR Switch_DrawArrays(GLenum mode, GLint first, GLsizei count) + { + DrawArrays = (PFNDRAWARRAYSPROC)IntGetProcAddress("glDrawArrays"); + DrawArrays(mode, first, count); + } + + static void CODEGEN_FUNCPTR Switch_DrawElements(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices) + { + DrawElements = (PFNDRAWELEMENTSPROC)IntGetProcAddress("glDrawElements"); + DrawElements(mode, count, type, indices); + } + + static void CODEGEN_FUNCPTR Switch_GetPointerv(GLenum pname, GLvoid* *params) + { + GetPointerv = (PFNGETPOINTERVPROC)IntGetProcAddress("glGetPointerv"); + GetPointerv(pname, params); + } + + static void CODEGEN_FUNCPTR Switch_PolygonOffset(GLfloat factor, GLfloat units) + { + PolygonOffset = (PFNPOLYGONOFFSETPROC)IntGetProcAddress("glPolygonOffset"); + PolygonOffset(factor, units); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexImage1D(GLenum target, GLint level, GLenum internalformat, GLint 
x, GLint y, GLsizei width, GLint border) + { + CopyTexImage1D = (PFNCOPYTEXIMAGE1DPROC)IntGetProcAddress("glCopyTexImage1D"); + CopyTexImage1D(target, level, internalformat, x, y, width, border); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexImage2D(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border) + { + CopyTexImage2D = (PFNCOPYTEXIMAGE2DPROC)IntGetProcAddress("glCopyTexImage2D"); + CopyTexImage2D(target, level, internalformat, x, y, width, height, border); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexSubImage1D(GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width) + { + CopyTexSubImage1D = (PFNCOPYTEXSUBIMAGE1DPROC)IntGetProcAddress("glCopyTexSubImage1D"); + CopyTexSubImage1D(target, level, xoffset, x, y, width); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height) + { + CopyTexSubImage2D = (PFNCOPYTEXSUBIMAGE2DPROC)IntGetProcAddress("glCopyTexSubImage2D"); + CopyTexSubImage2D(target, level, xoffset, yoffset, x, y, width, height); + } + + static void CODEGEN_FUNCPTR Switch_TexSubImage1D(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const GLvoid *pixels) + { + TexSubImage1D = (PFNTEXSUBIMAGE1DPROC)IntGetProcAddress("glTexSubImage1D"); + TexSubImage1D(target, level, xoffset, width, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels) + { + TexSubImage2D = (PFNTEXSUBIMAGE2DPROC)IntGetProcAddress("glTexSubImage2D"); + TexSubImage2D(target, level, xoffset, yoffset, width, height, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_BindTexture(GLenum target, GLuint texture) + { + BindTexture = (PFNBINDTEXTUREPROC)IntGetProcAddress("glBindTexture"); + BindTexture(target, texture); + } + + static void CODEGEN_FUNCPTR Switch_DeleteTextures(GLsizei n, const GLuint *textures) + { + DeleteTextures = (PFNDELETETEXTURESPROC)IntGetProcAddress("glDeleteTextures"); + DeleteTextures(n, textures); + } + + static void CODEGEN_FUNCPTR Switch_GenTextures(GLsizei n, GLuint *textures) + { + GenTextures = (PFNGENTEXTURESPROC)IntGetProcAddress("glGenTextures"); + GenTextures(n, textures); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsTexture(GLuint texture) + { + IsTexture = (PFNISTEXTUREPROC)IntGetProcAddress("glIsTexture"); + return IsTexture(texture); + } + + static void CODEGEN_FUNCPTR Switch_Indexub(GLubyte c) + { + Indexub = (PFNINDEXUBPROC)IntGetProcAddress("glIndexub"); + Indexub(c); + } + + static void CODEGEN_FUNCPTR Switch_Indexubv(const GLubyte *c) + { + Indexubv = (PFNINDEXUBVPROC)IntGetProcAddress("glIndexubv"); + Indexubv(c); + } + + // Extension: 1.2 + + static void CODEGEN_FUNCPTR Switch_BlendColor(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha) + { + BlendColor = (PFNBLENDCOLORPROC)IntGetProcAddress("glBlendColor"); + BlendColor(red, green, blue, alpha); + } + + static void CODEGEN_FUNCPTR Switch_BlendEquation(GLenum mode) + { + BlendEquation = (PFNBLENDEQUATIONPROC)IntGetProcAddress("glBlendEquation"); + BlendEquation(mode); + } + + static void CODEGEN_FUNCPTR Switch_DrawRangeElements(GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid *indices) + { + DrawRangeElements = 
(PFNDRAWRANGEELEMENTSPROC)IntGetProcAddress("glDrawRangeElements"); + DrawRangeElements(mode, start, end, count, type, indices); + } + + static void CODEGEN_FUNCPTR Switch_TexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid *pixels) + { + TexSubImage3D = (PFNTEXSUBIMAGE3DPROC)IntGetProcAddress("glTexSubImage3D"); + TexSubImage3D(target, level, xoffset, yoffset, zoffset, width, height, depth, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height) + { + CopyTexSubImage3D = (PFNCOPYTEXSUBIMAGE3DPROC)IntGetProcAddress("glCopyTexSubImage3D"); + CopyTexSubImage3D(target, level, xoffset, yoffset, zoffset, x, y, width, height); + } + + // Extension: 1.3 + + static void CODEGEN_FUNCPTR Switch_ActiveTexture(GLenum texture) + { + ActiveTexture = (PFNACTIVETEXTUREPROC)IntGetProcAddress("glActiveTexture"); + ActiveTexture(texture); + } + + static void CODEGEN_FUNCPTR Switch_SampleCoverage(GLfloat value, GLboolean invert) + { + SampleCoverage = (PFNSAMPLECOVERAGEPROC)IntGetProcAddress("glSampleCoverage"); + SampleCoverage(value, invert); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexImage3D(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid *data) + { + CompressedTexImage3D = (PFNCOMPRESSEDTEXIMAGE3DPROC)IntGetProcAddress("glCompressedTexImage3D"); + CompressedTexImage3D(target, level, internalformat, width, height, depth, border, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexImage2D(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const GLvoid *data) + { + CompressedTexImage2D = (PFNCOMPRESSEDTEXIMAGE2DPROC)IntGetProcAddress("glCompressedTexImage2D"); + CompressedTexImage2D(target, level, internalformat, width, height, border, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexImage1D(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const GLvoid *data) + { + CompressedTexImage1D = (PFNCOMPRESSEDTEXIMAGE1DPROC)IntGetProcAddress("glCompressedTexImage1D"); + CompressedTexImage1D(target, level, internalformat, width, border, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid *data) + { + CompressedTexSubImage3D = (PFNCOMPRESSEDTEXSUBIMAGE3DPROC)IntGetProcAddress("glCompressedTexSubImage3D"); + CompressedTexSubImage3D(target, level, xoffset, yoffset, zoffset, width, height, depth, format, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const GLvoid *data) + { + CompressedTexSubImage2D = (PFNCOMPRESSEDTEXSUBIMAGE2DPROC)IntGetProcAddress("glCompressedTexSubImage2D"); + CompressedTexSubImage2D(target, level, xoffset, yoffset, width, height, format, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexSubImage1D(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, 
const GLvoid *data) + { + CompressedTexSubImage1D = (PFNCOMPRESSEDTEXSUBIMAGE1DPROC)IntGetProcAddress("glCompressedTexSubImage1D"); + CompressedTexSubImage1D(target, level, xoffset, width, format, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_GetCompressedTexImage(GLenum target, GLint level, GLvoid *img) + { + GetCompressedTexImage = (PFNGETCOMPRESSEDTEXIMAGEPROC)IntGetProcAddress("glGetCompressedTexImage"); + GetCompressedTexImage(target, level, img); + } + + // Extension: 1.4 + + static void CODEGEN_FUNCPTR Switch_BlendFuncSeparate(GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha) + { + BlendFuncSeparate = (PFNBLENDFUNCSEPARATEPROC)IntGetProcAddress("glBlendFuncSeparate"); + BlendFuncSeparate(sfactorRGB, dfactorRGB, sfactorAlpha, dfactorAlpha); + } + + static void CODEGEN_FUNCPTR Switch_MultiDrawArrays(GLenum mode, const GLint *first, const GLsizei *count, GLsizei drawcount) + { + MultiDrawArrays = (PFNMULTIDRAWARRAYSPROC)IntGetProcAddress("glMultiDrawArrays"); + MultiDrawArrays(mode, first, count, drawcount); + } + + static void CODEGEN_FUNCPTR Switch_MultiDrawElements(GLenum mode, const GLsizei *count, GLenum type, const GLvoid* const *indices, GLsizei drawcount) + { + MultiDrawElements = (PFNMULTIDRAWELEMENTSPROC)IntGetProcAddress("glMultiDrawElements"); + MultiDrawElements(mode, count, type, indices, drawcount); + } + + static void CODEGEN_FUNCPTR Switch_PointParameterf(GLenum pname, GLfloat param) + { + PointParameterf = (PFNPOINTPARAMETERFPROC)IntGetProcAddress("glPointParameterf"); + PointParameterf(pname, param); + } + + static void CODEGEN_FUNCPTR Switch_PointParameterfv(GLenum pname, const GLfloat *params) + { + PointParameterfv = (PFNPOINTPARAMETERFVPROC)IntGetProcAddress("glPointParameterfv"); + PointParameterfv(pname, params); + } + + static void CODEGEN_FUNCPTR Switch_PointParameteri(GLenum pname, GLint param) + { + PointParameteri = (PFNPOINTPARAMETERIPROC)IntGetProcAddress("glPointParameteri"); + PointParameteri(pname, param); + } + + static void CODEGEN_FUNCPTR Switch_PointParameteriv(GLenum pname, const GLint *params) + { + PointParameteriv = (PFNPOINTPARAMETERIVPROC)IntGetProcAddress("glPointParameteriv"); + PointParameteriv(pname, params); + } + + // Extension: 1.5 + + static void CODEGEN_FUNCPTR Switch_GenQueries(GLsizei n, GLuint *ids) + { + GenQueries = (PFNGENQUERIESPROC)IntGetProcAddress("glGenQueries"); + GenQueries(n, ids); + } + + static void CODEGEN_FUNCPTR Switch_DeleteQueries(GLsizei n, const GLuint *ids) + { + DeleteQueries = (PFNDELETEQUERIESPROC)IntGetProcAddress("glDeleteQueries"); + DeleteQueries(n, ids); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsQuery(GLuint id) + { + IsQuery = (PFNISQUERYPROC)IntGetProcAddress("glIsQuery"); + return IsQuery(id); + } + + static void CODEGEN_FUNCPTR Switch_BeginQuery(GLenum target, GLuint id) + { + BeginQuery = (PFNBEGINQUERYPROC)IntGetProcAddress("glBeginQuery"); + BeginQuery(target, id); + } + + static void CODEGEN_FUNCPTR Switch_EndQuery(GLenum target) + { + EndQuery = (PFNENDQUERYPROC)IntGetProcAddress("glEndQuery"); + EndQuery(target); + } + + static void CODEGEN_FUNCPTR Switch_GetQueryiv(GLenum target, GLenum pname, GLint *params) + { + GetQueryiv = (PFNGETQUERYIVPROC)IntGetProcAddress("glGetQueryiv"); + GetQueryiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetQueryObjectiv(GLuint id, GLenum pname, GLint *params) + { + GetQueryObjectiv = (PFNGETQUERYOBJECTIVPROC)IntGetProcAddress("glGetQueryObjectiv"); + GetQueryObjectiv(id, pname, 
params); + } + + static void CODEGEN_FUNCPTR Switch_GetQueryObjectuiv(GLuint id, GLenum pname, GLuint *params) + { + GetQueryObjectuiv = (PFNGETQUERYOBJECTUIVPROC)IntGetProcAddress("glGetQueryObjectuiv"); + GetQueryObjectuiv(id, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_BindBuffer(GLenum target, GLuint buffer) + { + BindBuffer = (PFNBINDBUFFERPROC)IntGetProcAddress("glBindBuffer"); + BindBuffer(target, buffer); + } + + static void CODEGEN_FUNCPTR Switch_DeleteBuffers(GLsizei n, const GLuint *buffers) + { + DeleteBuffers = (PFNDELETEBUFFERSPROC)IntGetProcAddress("glDeleteBuffers"); + DeleteBuffers(n, buffers); + } + + static void CODEGEN_FUNCPTR Switch_GenBuffers(GLsizei n, GLuint *buffers) + { + GenBuffers = (PFNGENBUFFERSPROC)IntGetProcAddress("glGenBuffers"); + GenBuffers(n, buffers); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsBuffer(GLuint buffer) + { + IsBuffer = (PFNISBUFFERPROC)IntGetProcAddress("glIsBuffer"); + return IsBuffer(buffer); + } + + static void CODEGEN_FUNCPTR Switch_BufferData(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage) + { + BufferData = (PFNBUFFERDATAPROC)IntGetProcAddress("glBufferData"); + BufferData(target, size, data, usage); + } + + static void CODEGEN_FUNCPTR Switch_BufferSubData(GLenum target, GLintptr offset, GLsizeiptr size, const GLvoid *data) + { + BufferSubData = (PFNBUFFERSUBDATAPROC)IntGetProcAddress("glBufferSubData"); + BufferSubData(target, offset, size, data); + } + + static void CODEGEN_FUNCPTR Switch_GetBufferSubData(GLenum target, GLintptr offset, GLsizeiptr size, GLvoid *data) + { + GetBufferSubData = (PFNGETBUFFERSUBDATAPROC)IntGetProcAddress("glGetBufferSubData"); + GetBufferSubData(target, offset, size, data); + } + + static GLvoid* CODEGEN_FUNCPTR Switch_MapBuffer(GLenum target, GLenum access) + { + MapBuffer = (PFNMAPBUFFERPROC)IntGetProcAddress("glMapBuffer"); + return MapBuffer(target, access); + } + + static GLboolean CODEGEN_FUNCPTR Switch_UnmapBuffer(GLenum target) + { + UnmapBuffer = (PFNUNMAPBUFFERPROC)IntGetProcAddress("glUnmapBuffer"); + return UnmapBuffer(target); + } + + static void CODEGEN_FUNCPTR Switch_GetBufferParameteriv(GLenum target, GLenum pname, GLint *params) + { + GetBufferParameteriv = (PFNGETBUFFERPARAMETERIVPROC)IntGetProcAddress("glGetBufferParameteriv"); + GetBufferParameteriv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetBufferPointerv(GLenum target, GLenum pname, GLvoid* *params) + { + GetBufferPointerv = (PFNGETBUFFERPOINTERVPROC)IntGetProcAddress("glGetBufferPointerv"); + GetBufferPointerv(target, pname, params); + } + + // Extension: 2.0 + + static void CODEGEN_FUNCPTR Switch_BlendEquationSeparate(GLenum modeRGB, GLenum modeAlpha) + { + BlendEquationSeparate = (PFNBLENDEQUATIONSEPARATEPROC)IntGetProcAddress("glBlendEquationSeparate"); + BlendEquationSeparate(modeRGB, modeAlpha); + } + + static void CODEGEN_FUNCPTR Switch_DrawBuffers(GLsizei n, const GLenum *bufs) + { + DrawBuffers = (PFNDRAWBUFFERSPROC)IntGetProcAddress("glDrawBuffers"); + DrawBuffers(n, bufs); + } + + static void CODEGEN_FUNCPTR Switch_StencilOpSeparate(GLenum face, GLenum sfail, GLenum dpfail, GLenum dppass) + { + StencilOpSeparate = (PFNSTENCILOPSEPARATEPROC)IntGetProcAddress("glStencilOpSeparate"); + StencilOpSeparate(face, sfail, dpfail, dppass); + } + + static void CODEGEN_FUNCPTR Switch_StencilFuncSeparate(GLenum face, GLenum func, GLint ref, GLuint mask) + { + StencilFuncSeparate = (PFNSTENCILFUNCSEPARATEPROC)IntGetProcAddress("glStencilFuncSeparate"); + 
StencilFuncSeparate(face, func, ref, mask); + } + + static void CODEGEN_FUNCPTR Switch_StencilMaskSeparate(GLenum face, GLuint mask) + { + StencilMaskSeparate = (PFNSTENCILMASKSEPARATEPROC)IntGetProcAddress("glStencilMaskSeparate"); + StencilMaskSeparate(face, mask); + } + + static void CODEGEN_FUNCPTR Switch_AttachShader(GLuint program, GLuint shader) + { + AttachShader = (PFNATTACHSHADERPROC)IntGetProcAddress("glAttachShader"); + AttachShader(program, shader); + } + + static void CODEGEN_FUNCPTR Switch_BindAttribLocation(GLuint program, GLuint index, const GLchar *name) + { + BindAttribLocation = (PFNBINDATTRIBLOCATIONPROC)IntGetProcAddress("glBindAttribLocation"); + BindAttribLocation(program, index, name); + } + + static void CODEGEN_FUNCPTR Switch_CompileShader(GLuint shader) + { + CompileShader = (PFNCOMPILESHADERPROC)IntGetProcAddress("glCompileShader"); + CompileShader(shader); + } + + static GLuint CODEGEN_FUNCPTR Switch_CreateProgram() + { + CreateProgram = (PFNCREATEPROGRAMPROC)IntGetProcAddress("glCreateProgram"); + return CreateProgram(); + } + + static GLuint CODEGEN_FUNCPTR Switch_CreateShader(GLenum type) + { + CreateShader = (PFNCREATESHADERPROC)IntGetProcAddress("glCreateShader"); + return CreateShader(type); + } + + static void CODEGEN_FUNCPTR Switch_DeleteProgram(GLuint program) + { + DeleteProgram = (PFNDELETEPROGRAMPROC)IntGetProcAddress("glDeleteProgram"); + DeleteProgram(program); + } + + static void CODEGEN_FUNCPTR Switch_DeleteShader(GLuint shader) + { + DeleteShader = (PFNDELETESHADERPROC)IntGetProcAddress("glDeleteShader"); + DeleteShader(shader); + } + + static void CODEGEN_FUNCPTR Switch_DetachShader(GLuint program, GLuint shader) + { + DetachShader = (PFNDETACHSHADERPROC)IntGetProcAddress("glDetachShader"); + DetachShader(program, shader); + } + + static void CODEGEN_FUNCPTR Switch_DisableVertexAttribArray(GLuint index) + { + DisableVertexAttribArray = (PFNDISABLEVERTEXATTRIBARRAYPROC)IntGetProcAddress("glDisableVertexAttribArray"); + DisableVertexAttribArray(index); + } + + static void CODEGEN_FUNCPTR Switch_EnableVertexAttribArray(GLuint index) + { + EnableVertexAttribArray = (PFNENABLEVERTEXATTRIBARRAYPROC)IntGetProcAddress("glEnableVertexAttribArray"); + EnableVertexAttribArray(index); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveAttrib(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name) + { + GetActiveAttrib = (PFNGETACTIVEATTRIBPROC)IntGetProcAddress("glGetActiveAttrib"); + GetActiveAttrib(program, index, bufSize, length, size, type, name); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniform(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name) + { + GetActiveUniform = (PFNGETACTIVEUNIFORMPROC)IntGetProcAddress("glGetActiveUniform"); + GetActiveUniform(program, index, bufSize, length, size, type, name); + } + + static void CODEGEN_FUNCPTR Switch_GetAttachedShaders(GLuint program, GLsizei maxCount, GLsizei *count, GLuint *obj) + { + GetAttachedShaders = (PFNGETATTACHEDSHADERSPROC)IntGetProcAddress("glGetAttachedShaders"); + GetAttachedShaders(program, maxCount, count, obj); + } + + static GLint CODEGEN_FUNCPTR Switch_GetAttribLocation(GLuint program, const GLchar *name) + { + GetAttribLocation = (PFNGETATTRIBLOCATIONPROC)IntGetProcAddress("glGetAttribLocation"); + return GetAttribLocation(program, name); + } + + static void CODEGEN_FUNCPTR Switch_GetProgramiv(GLuint program, GLenum pname, GLint *params) + { + GetProgramiv = 
(PFNGETPROGRAMIVPROC)IntGetProcAddress("glGetProgramiv"); + GetProgramiv(program, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetProgramInfoLog(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog) + { + GetProgramInfoLog = (PFNGETPROGRAMINFOLOGPROC)IntGetProcAddress("glGetProgramInfoLog"); + GetProgramInfoLog(program, bufSize, length, infoLog); + } + + static void CODEGEN_FUNCPTR Switch_GetShaderiv(GLuint shader, GLenum pname, GLint *params) + { + GetShaderiv = (PFNGETSHADERIVPROC)IntGetProcAddress("glGetShaderiv"); + GetShaderiv(shader, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetShaderInfoLog(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog) + { + GetShaderInfoLog = (PFNGETSHADERINFOLOGPROC)IntGetProcAddress("glGetShaderInfoLog"); + GetShaderInfoLog(shader, bufSize, length, infoLog); + } + + static void CODEGEN_FUNCPTR Switch_GetShaderSource(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *source) + { + GetShaderSource = (PFNGETSHADERSOURCEPROC)IntGetProcAddress("glGetShaderSource"); + GetShaderSource(shader, bufSize, length, source); + } + + static GLint CODEGEN_FUNCPTR Switch_GetUniformLocation(GLuint program, const GLchar *name) + { + GetUniformLocation = (PFNGETUNIFORMLOCATIONPROC)IntGetProcAddress("glGetUniformLocation"); + return GetUniformLocation(program, name); + } + + static void CODEGEN_FUNCPTR Switch_GetUniformfv(GLuint program, GLint location, GLfloat *params) + { + GetUniformfv = (PFNGETUNIFORMFVPROC)IntGetProcAddress("glGetUniformfv"); + GetUniformfv(program, location, params); + } + + static void CODEGEN_FUNCPTR Switch_GetUniformiv(GLuint program, GLint location, GLint *params) + { + GetUniformiv = (PFNGETUNIFORMIVPROC)IntGetProcAddress("glGetUniformiv"); + GetUniformiv(program, location, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribdv(GLuint index, GLenum pname, GLdouble *params) + { + GetVertexAttribdv = (PFNGETVERTEXATTRIBDVPROC)IntGetProcAddress("glGetVertexAttribdv"); + GetVertexAttribdv(index, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribfv(GLuint index, GLenum pname, GLfloat *params) + { + GetVertexAttribfv = (PFNGETVERTEXATTRIBFVPROC)IntGetProcAddress("glGetVertexAttribfv"); + GetVertexAttribfv(index, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribiv(GLuint index, GLenum pname, GLint *params) + { + GetVertexAttribiv = (PFNGETVERTEXATTRIBIVPROC)IntGetProcAddress("glGetVertexAttribiv"); + GetVertexAttribiv(index, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribPointerv(GLuint index, GLenum pname, GLvoid* *pointer) + { + GetVertexAttribPointerv = (PFNGETVERTEXATTRIBPOINTERVPROC)IntGetProcAddress("glGetVertexAttribPointerv"); + GetVertexAttribPointerv(index, pname, pointer); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsProgram(GLuint program) + { + IsProgram = (PFNISPROGRAMPROC)IntGetProcAddress("glIsProgram"); + return IsProgram(program); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsShader(GLuint shader) + { + IsShader = (PFNISSHADERPROC)IntGetProcAddress("glIsShader"); + return IsShader(shader); + } + + static void CODEGEN_FUNCPTR Switch_LinkProgram(GLuint program) + { + LinkProgram = (PFNLINKPROGRAMPROC)IntGetProcAddress("glLinkProgram"); + LinkProgram(program); + } + + static void CODEGEN_FUNCPTR Switch_ShaderSource(GLuint shader, GLsizei count, const GLchar* const *string, const GLint *length) + { + ShaderSource = (PFNSHADERSOURCEPROC)IntGetProcAddress("glShaderSource"); + 
ShaderSource(shader, count, string, length); + } + + static void CODEGEN_FUNCPTR Switch_UseProgram(GLuint program) + { + UseProgram = (PFNUSEPROGRAMPROC)IntGetProcAddress("glUseProgram"); + UseProgram(program); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1f(GLint location, GLfloat v0) + { + Uniform1f = (PFNUNIFORM1FPROC)IntGetProcAddress("glUniform1f"); + Uniform1f(location, v0); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2f(GLint location, GLfloat v0, GLfloat v1) + { + Uniform2f = (PFNUNIFORM2FPROC)IntGetProcAddress("glUniform2f"); + Uniform2f(location, v0, v1); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3f(GLint location, GLfloat v0, GLfloat v1, GLfloat v2) + { + Uniform3f = (PFNUNIFORM3FPROC)IntGetProcAddress("glUniform3f"); + Uniform3f(location, v0, v1, v2); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4f(GLint location, GLfloat v0, GLfloat v1, GLfloat v2, GLfloat v3) + { + Uniform4f = (PFNUNIFORM4FPROC)IntGetProcAddress("glUniform4f"); + Uniform4f(location, v0, v1, v2, v3); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1i(GLint location, GLint v0) + { + Uniform1i = (PFNUNIFORM1IPROC)IntGetProcAddress("glUniform1i"); + Uniform1i(location, v0); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2i(GLint location, GLint v0, GLint v1) + { + Uniform2i = (PFNUNIFORM2IPROC)IntGetProcAddress("glUniform2i"); + Uniform2i(location, v0, v1); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3i(GLint location, GLint v0, GLint v1, GLint v2) + { + Uniform3i = (PFNUNIFORM3IPROC)IntGetProcAddress("glUniform3i"); + Uniform3i(location, v0, v1, v2); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4i(GLint location, GLint v0, GLint v1, GLint v2, GLint v3) + { + Uniform4i = (PFNUNIFORM4IPROC)IntGetProcAddress("glUniform4i"); + Uniform4i(location, v0, v1, v2, v3); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1fv(GLint location, GLsizei count, const GLfloat *value) + { + Uniform1fv = (PFNUNIFORM1FVPROC)IntGetProcAddress("glUniform1fv"); + Uniform1fv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2fv(GLint location, GLsizei count, const GLfloat *value) + { + Uniform2fv = (PFNUNIFORM2FVPROC)IntGetProcAddress("glUniform2fv"); + Uniform2fv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3fv(GLint location, GLsizei count, const GLfloat *value) + { + Uniform3fv = (PFNUNIFORM3FVPROC)IntGetProcAddress("glUniform3fv"); + Uniform3fv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4fv(GLint location, GLsizei count, const GLfloat *value) + { + Uniform4fv = (PFNUNIFORM4FVPROC)IntGetProcAddress("glUniform4fv"); + Uniform4fv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1iv(GLint location, GLsizei count, const GLint *value) + { + Uniform1iv = (PFNUNIFORM1IVPROC)IntGetProcAddress("glUniform1iv"); + Uniform1iv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2iv(GLint location, GLsizei count, const GLint *value) + { + Uniform2iv = (PFNUNIFORM2IVPROC)IntGetProcAddress("glUniform2iv"); + Uniform2iv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3iv(GLint location, GLsizei count, const GLint *value) + { + Uniform3iv = (PFNUNIFORM3IVPROC)IntGetProcAddress("glUniform3iv"); + Uniform3iv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4iv(GLint location, GLsizei count, const GLint *value) + { + Uniform4iv = (PFNUNIFORM4IVPROC)IntGetProcAddress("glUniform4iv"); + Uniform4iv(location, count, value); + } + + 
static void CODEGEN_FUNCPTR Switch_UniformMatrix2fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix2fv = (PFNUNIFORMMATRIX2FVPROC)IntGetProcAddress("glUniformMatrix2fv"); + UniformMatrix2fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix3fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix3fv = (PFNUNIFORMMATRIX3FVPROC)IntGetProcAddress("glUniformMatrix3fv"); + UniformMatrix3fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix4fv = (PFNUNIFORMMATRIX4FVPROC)IntGetProcAddress("glUniformMatrix4fv"); + UniformMatrix4fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_ValidateProgram(GLuint program) + { + ValidateProgram = (PFNVALIDATEPROGRAMPROC)IntGetProcAddress("glValidateProgram"); + ValidateProgram(program); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const GLvoid *pointer) + { + VertexAttribPointer = (PFNVERTEXATTRIBPOINTERPROC)IntGetProcAddress("glVertexAttribPointer"); + VertexAttribPointer(index, size, type, normalized, stride, pointer); + } + + // Extension: 2.1 + + static void CODEGEN_FUNCPTR Switch_UniformMatrix2x3fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix2x3fv = (PFNUNIFORMMATRIX2X3FVPROC)IntGetProcAddress("glUniformMatrix2x3fv"); + UniformMatrix2x3fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix3x2fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix3x2fv = (PFNUNIFORMMATRIX3X2FVPROC)IntGetProcAddress("glUniformMatrix3x2fv"); + UniformMatrix3x2fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix2x4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix2x4fv = (PFNUNIFORMMATRIX2X4FVPROC)IntGetProcAddress("glUniformMatrix2x4fv"); + UniformMatrix2x4fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix4x2fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix4x2fv = (PFNUNIFORMMATRIX4X2FVPROC)IntGetProcAddress("glUniformMatrix4x2fv"); + UniformMatrix4x2fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix3x4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix3x4fv = (PFNUNIFORMMATRIX3X4FVPROC)IntGetProcAddress("glUniformMatrix3x4fv"); + UniformMatrix3x4fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix4x3fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix4x3fv = (PFNUNIFORMMATRIX4X3FVPROC)IntGetProcAddress("glUniformMatrix4x3fv"); + UniformMatrix4x3fv(location, count, transpose, value); + } + + // Extension: ARB_vertex_array_object + + static void CODEGEN_FUNCPTR Switch_BindVertexArray(GLuint ren_array) + { + BindVertexArray = (PFNBINDVERTEXARRAYPROC)IntGetProcAddress("glBindVertexArray"); + BindVertexArray(ren_array); + } + + static void CODEGEN_FUNCPTR Switch_DeleteVertexArrays(GLsizei n, const GLuint *arrays) + { + DeleteVertexArrays = (PFNDELETEVERTEXARRAYSPROC)IntGetProcAddress("glDeleteVertexArrays"); + 
DeleteVertexArrays(n, arrays); + } + + static void CODEGEN_FUNCPTR Switch_GenVertexArrays(GLsizei n, GLuint *arrays) + { + GenVertexArrays = (PFNGENVERTEXARRAYSPROC)IntGetProcAddress("glGenVertexArrays"); + GenVertexArrays(n, arrays); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsVertexArray(GLuint ren_array) + { + IsVertexArray = (PFNISVERTEXARRAYPROC)IntGetProcAddress("glIsVertexArray"); + return IsVertexArray(ren_array); + } + + // Extension: ARB_map_buffer_range + + static GLvoid* CODEGEN_FUNCPTR Switch_MapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access) + { + MapBufferRange = (PFNMAPBUFFERRANGEPROC)IntGetProcAddress("glMapBufferRange"); + return MapBufferRange(target, offset, length, access); + } + + static void CODEGEN_FUNCPTR Switch_FlushMappedBufferRange(GLenum target, GLintptr offset, GLsizeiptr length) + { + FlushMappedBufferRange = (PFNFLUSHMAPPEDBUFFERRANGEPROC)IntGetProcAddress("glFlushMappedBufferRange"); + FlushMappedBufferRange(target, offset, length); + } + + // Extension: ARB_framebuffer_object + + static GLboolean CODEGEN_FUNCPTR Switch_IsRenderbuffer(GLuint renderbuffer) + { + IsRenderbuffer = (PFNISRENDERBUFFERPROC)IntGetProcAddress("glIsRenderbuffer"); + return IsRenderbuffer(renderbuffer); + } + + static void CODEGEN_FUNCPTR Switch_BindRenderbuffer(GLenum target, GLuint renderbuffer) + { + BindRenderbuffer = (PFNBINDRENDERBUFFERPROC)IntGetProcAddress("glBindRenderbuffer"); + BindRenderbuffer(target, renderbuffer); + } + + static void CODEGEN_FUNCPTR Switch_DeleteRenderbuffers(GLsizei n, const GLuint *renderbuffers) + { + DeleteRenderbuffers = (PFNDELETERENDERBUFFERSPROC)IntGetProcAddress("glDeleteRenderbuffers"); + DeleteRenderbuffers(n, renderbuffers); + } + + static void CODEGEN_FUNCPTR Switch_GenRenderbuffers(GLsizei n, GLuint *renderbuffers) + { + GenRenderbuffers = (PFNGENRENDERBUFFERSPROC)IntGetProcAddress("glGenRenderbuffers"); + GenRenderbuffers(n, renderbuffers); + } + + static void CODEGEN_FUNCPTR Switch_RenderbufferStorage(GLenum target, GLenum internalformat, GLsizei width, GLsizei height) + { + RenderbufferStorage = (PFNRENDERBUFFERSTORAGEPROC)IntGetProcAddress("glRenderbufferStorage"); + RenderbufferStorage(target, internalformat, width, height); + } + + static void CODEGEN_FUNCPTR Switch_GetRenderbufferParameteriv(GLenum target, GLenum pname, GLint *params) + { + GetRenderbufferParameteriv = (PFNGETRENDERBUFFERPARAMETERIVPROC)IntGetProcAddress("glGetRenderbufferParameteriv"); + GetRenderbufferParameteriv(target, pname, params); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsFramebuffer(GLuint framebuffer) + { + IsFramebuffer = (PFNISFRAMEBUFFERPROC)IntGetProcAddress("glIsFramebuffer"); + return IsFramebuffer(framebuffer); + } + + static void CODEGEN_FUNCPTR Switch_BindFramebuffer(GLenum target, GLuint framebuffer) + { + BindFramebuffer = (PFNBINDFRAMEBUFFERPROC)IntGetProcAddress("glBindFramebuffer"); + BindFramebuffer(target, framebuffer); + } + + static void CODEGEN_FUNCPTR Switch_DeleteFramebuffers(GLsizei n, const GLuint *framebuffers) + { + DeleteFramebuffers = (PFNDELETEFRAMEBUFFERSPROC)IntGetProcAddress("glDeleteFramebuffers"); + DeleteFramebuffers(n, framebuffers); + } + + static void CODEGEN_FUNCPTR Switch_GenFramebuffers(GLsizei n, GLuint *framebuffers) + { + GenFramebuffers = (PFNGENFRAMEBUFFERSPROC)IntGetProcAddress("glGenFramebuffers"); + GenFramebuffers(n, framebuffers); + } + + static GLenum CODEGEN_FUNCPTR Switch_CheckFramebufferStatus(GLenum target) + { + CheckFramebufferStatus = 
(PFNCHECKFRAMEBUFFERSTATUSPROC)IntGetProcAddress("glCheckFramebufferStatus"); + return CheckFramebufferStatus(target); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferTexture1D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level) + { + FramebufferTexture1D = (PFNFRAMEBUFFERTEXTURE1DPROC)IntGetProcAddress("glFramebufferTexture1D"); + FramebufferTexture1D(target, attachment, textarget, texture, level); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level) + { + FramebufferTexture2D = (PFNFRAMEBUFFERTEXTURE2DPROC)IntGetProcAddress("glFramebufferTexture2D"); + FramebufferTexture2D(target, attachment, textarget, texture, level); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferTexture3D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset) + { + FramebufferTexture3D = (PFNFRAMEBUFFERTEXTURE3DPROC)IntGetProcAddress("glFramebufferTexture3D"); + FramebufferTexture3D(target, attachment, textarget, texture, level, zoffset); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferRenderbuffer(GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer) + { + FramebufferRenderbuffer = (PFNFRAMEBUFFERRENDERBUFFERPROC)IntGetProcAddress("glFramebufferRenderbuffer"); + FramebufferRenderbuffer(target, attachment, renderbuffertarget, renderbuffer); + } + + static void CODEGEN_FUNCPTR Switch_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment, GLenum pname, GLint *params) + { + GetFramebufferAttachmentParameteriv = (PFNGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)IntGetProcAddress("glGetFramebufferAttachmentParameteriv"); + GetFramebufferAttachmentParameteriv(target, attachment, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GenerateMipmap(GLenum target) + { + GenerateMipmap = (PFNGENERATEMIPMAPPROC)IntGetProcAddress("glGenerateMipmap"); + GenerateMipmap(target); + } + + static void CODEGEN_FUNCPTR Switch_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter) + { + BlitFramebuffer = (PFNBLITFRAMEBUFFERPROC)IntGetProcAddress("glBlitFramebuffer"); + BlitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter); + } + + static void CODEGEN_FUNCPTR Switch_RenderbufferStorageMultisample(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height) + { + RenderbufferStorageMultisample = (PFNRENDERBUFFERSTORAGEMULTISAMPLEPROC)IntGetProcAddress("glRenderbufferStorageMultisample"); + RenderbufferStorageMultisample(target, samples, internalformat, width, height); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferTextureLayer(GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer) + { + FramebufferTextureLayer = (PFNFRAMEBUFFERTEXTURELAYERPROC)IntGetProcAddress("glFramebufferTextureLayer"); + FramebufferTextureLayer(target, attachment, texture, level, layer); + } + + // Extension: 3.0 + + static void CODEGEN_FUNCPTR Switch_ColorMaski(GLuint index, GLboolean r, GLboolean g, GLboolean b, GLboolean a) + { + ColorMaski = (PFNCOLORMASKIPROC)IntGetProcAddress("glColorMaski"); + ColorMaski(index, r, g, b, a); + } + + static void CODEGEN_FUNCPTR Switch_GetBooleani_v(GLenum target, GLuint index, GLboolean *data) + { + GetBooleani_v = (PFNGETBOOLEANI_VPROC)IntGetProcAddress("glGetBooleani_v"); + GetBooleani_v(target, index, data); + } + + static void 
CODEGEN_FUNCPTR Switch_GetIntegeri_v(GLenum target, GLuint index, GLint *data) + { + GetIntegeri_v = (PFNGETINTEGERI_VPROC)IntGetProcAddress("glGetIntegeri_v"); + GetIntegeri_v(target, index, data); + } + + static void CODEGEN_FUNCPTR Switch_Enablei(GLenum target, GLuint index) + { + Enablei = (PFNENABLEIPROC)IntGetProcAddress("glEnablei"); + Enablei(target, index); + } + + static void CODEGEN_FUNCPTR Switch_Disablei(GLenum target, GLuint index) + { + Disablei = (PFNDISABLEIPROC)IntGetProcAddress("glDisablei"); + Disablei(target, index); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsEnabledi(GLenum target, GLuint index) + { + IsEnabledi = (PFNISENABLEDIPROC)IntGetProcAddress("glIsEnabledi"); + return IsEnabledi(target, index); + } + + static void CODEGEN_FUNCPTR Switch_BeginTransformFeedback(GLenum primitiveMode) + { + BeginTransformFeedback = (PFNBEGINTRANSFORMFEEDBACKPROC)IntGetProcAddress("glBeginTransformFeedback"); + BeginTransformFeedback(primitiveMode); + } + + static void CODEGEN_FUNCPTR Switch_EndTransformFeedback() + { + EndTransformFeedback = (PFNENDTRANSFORMFEEDBACKPROC)IntGetProcAddress("glEndTransformFeedback"); + EndTransformFeedback(); + } + + static void CODEGEN_FUNCPTR Switch_BindBufferRange(GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size) + { + BindBufferRange = (PFNBINDBUFFERRANGEPROC)IntGetProcAddress("glBindBufferRange"); + BindBufferRange(target, index, buffer, offset, size); + } + + static void CODEGEN_FUNCPTR Switch_BindBufferBase(GLenum target, GLuint index, GLuint buffer) + { + BindBufferBase = (PFNBINDBUFFERBASEPROC)IntGetProcAddress("glBindBufferBase"); + BindBufferBase(target, index, buffer); + } + + static void CODEGEN_FUNCPTR Switch_TransformFeedbackVaryings(GLuint program, GLsizei count, const GLchar* const *varyings, GLenum bufferMode) + { + TransformFeedbackVaryings = (PFNTRANSFORMFEEDBACKVARYINGSPROC)IntGetProcAddress("glTransformFeedbackVaryings"); + TransformFeedbackVaryings(program, count, varyings, bufferMode); + } + + static void CODEGEN_FUNCPTR Switch_GetTransformFeedbackVarying(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLsizei *size, GLenum *type, GLchar *name) + { + GetTransformFeedbackVarying = (PFNGETTRANSFORMFEEDBACKVARYINGPROC)IntGetProcAddress("glGetTransformFeedbackVarying"); + GetTransformFeedbackVarying(program, index, bufSize, length, size, type, name); + } + + static void CODEGEN_FUNCPTR Switch_ClampColor(GLenum target, GLenum clamp) + { + ClampColor = (PFNCLAMPCOLORPROC)IntGetProcAddress("glClampColor"); + ClampColor(target, clamp); + } + + static void CODEGEN_FUNCPTR Switch_BeginConditionalRender(GLuint id, GLenum mode) + { + BeginConditionalRender = (PFNBEGINCONDITIONALRENDERPROC)IntGetProcAddress("glBeginConditionalRender"); + BeginConditionalRender(id, mode); + } + + static void CODEGEN_FUNCPTR Switch_EndConditionalRender() + { + EndConditionalRender = (PFNENDCONDITIONALRENDERPROC)IntGetProcAddress("glEndConditionalRender"); + EndConditionalRender(); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribIPointer(GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid *pointer) + { + VertexAttribIPointer = (PFNVERTEXATTRIBIPOINTERPROC)IntGetProcAddress("glVertexAttribIPointer"); + VertexAttribIPointer(index, size, type, stride, pointer); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribIiv(GLuint index, GLenum pname, GLint *params) + { + GetVertexAttribIiv = (PFNGETVERTEXATTRIBIIVPROC)IntGetProcAddress("glGetVertexAttribIiv"); + GetVertexAttribIiv(index, 
pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribIuiv(GLuint index, GLenum pname, GLuint *params) + { + GetVertexAttribIuiv = (PFNGETVERTEXATTRIBIUIVPROC)IntGetProcAddress("glGetVertexAttribIuiv"); + GetVertexAttribIuiv(index, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI1i(GLuint index, GLint x) + { + VertexAttribI1i = (PFNVERTEXATTRIBI1IPROC)IntGetProcAddress("glVertexAttribI1i"); + VertexAttribI1i(index, x); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI2i(GLuint index, GLint x, GLint y) + { + VertexAttribI2i = (PFNVERTEXATTRIBI2IPROC)IntGetProcAddress("glVertexAttribI2i"); + VertexAttribI2i(index, x, y); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI3i(GLuint index, GLint x, GLint y, GLint z) + { + VertexAttribI3i = (PFNVERTEXATTRIBI3IPROC)IntGetProcAddress("glVertexAttribI3i"); + VertexAttribI3i(index, x, y, z); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4i(GLuint index, GLint x, GLint y, GLint z, GLint w) + { + VertexAttribI4i = (PFNVERTEXATTRIBI4IPROC)IntGetProcAddress("glVertexAttribI4i"); + VertexAttribI4i(index, x, y, z, w); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI1ui(GLuint index, GLuint x) + { + VertexAttribI1ui = (PFNVERTEXATTRIBI1UIPROC)IntGetProcAddress("glVertexAttribI1ui"); + VertexAttribI1ui(index, x); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI2ui(GLuint index, GLuint x, GLuint y) + { + VertexAttribI2ui = (PFNVERTEXATTRIBI2UIPROC)IntGetProcAddress("glVertexAttribI2ui"); + VertexAttribI2ui(index, x, y); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI3ui(GLuint index, GLuint x, GLuint y, GLuint z) + { + VertexAttribI3ui = (PFNVERTEXATTRIBI3UIPROC)IntGetProcAddress("glVertexAttribI3ui"); + VertexAttribI3ui(index, x, y, z); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4ui(GLuint index, GLuint x, GLuint y, GLuint z, GLuint w) + { + VertexAttribI4ui = (PFNVERTEXATTRIBI4UIPROC)IntGetProcAddress("glVertexAttribI4ui"); + VertexAttribI4ui(index, x, y, z, w); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI1iv(GLuint index, const GLint *v) + { + VertexAttribI1iv = (PFNVERTEXATTRIBI1IVPROC)IntGetProcAddress("glVertexAttribI1iv"); + VertexAttribI1iv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI2iv(GLuint index, const GLint *v) + { + VertexAttribI2iv = (PFNVERTEXATTRIBI2IVPROC)IntGetProcAddress("glVertexAttribI2iv"); + VertexAttribI2iv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI3iv(GLuint index, const GLint *v) + { + VertexAttribI3iv = (PFNVERTEXATTRIBI3IVPROC)IntGetProcAddress("glVertexAttribI3iv"); + VertexAttribI3iv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4iv(GLuint index, const GLint *v) + { + VertexAttribI4iv = (PFNVERTEXATTRIBI4IVPROC)IntGetProcAddress("glVertexAttribI4iv"); + VertexAttribI4iv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI1uiv(GLuint index, const GLuint *v) + { + VertexAttribI1uiv = (PFNVERTEXATTRIBI1UIVPROC)IntGetProcAddress("glVertexAttribI1uiv"); + VertexAttribI1uiv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI2uiv(GLuint index, const GLuint *v) + { + VertexAttribI2uiv = (PFNVERTEXATTRIBI2UIVPROC)IntGetProcAddress("glVertexAttribI2uiv"); + VertexAttribI2uiv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI3uiv(GLuint index, const GLuint *v) + { + VertexAttribI3uiv = (PFNVERTEXATTRIBI3UIVPROC)IntGetProcAddress("glVertexAttribI3uiv"); + VertexAttribI3uiv(index, v); + } + + static void 
CODEGEN_FUNCPTR Switch_VertexAttribI4uiv(GLuint index, const GLuint *v) + { + VertexAttribI4uiv = (PFNVERTEXATTRIBI4UIVPROC)IntGetProcAddress("glVertexAttribI4uiv"); + VertexAttribI4uiv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4bv(GLuint index, const GLbyte *v) + { + VertexAttribI4bv = (PFNVERTEXATTRIBI4BVPROC)IntGetProcAddress("glVertexAttribI4bv"); + VertexAttribI4bv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4sv(GLuint index, const GLshort *v) + { + VertexAttribI4sv = (PFNVERTEXATTRIBI4SVPROC)IntGetProcAddress("glVertexAttribI4sv"); + VertexAttribI4sv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4ubv(GLuint index, const GLubyte *v) + { + VertexAttribI4ubv = (PFNVERTEXATTRIBI4UBVPROC)IntGetProcAddress("glVertexAttribI4ubv"); + VertexAttribI4ubv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4usv(GLuint index, const GLushort *v) + { + VertexAttribI4usv = (PFNVERTEXATTRIBI4USVPROC)IntGetProcAddress("glVertexAttribI4usv"); + VertexAttribI4usv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_GetUniformuiv(GLuint program, GLint location, GLuint *params) + { + GetUniformuiv = (PFNGETUNIFORMUIVPROC)IntGetProcAddress("glGetUniformuiv"); + GetUniformuiv(program, location, params); + } + + static void CODEGEN_FUNCPTR Switch_BindFragDataLocation(GLuint program, GLuint color, const GLchar *name) + { + BindFragDataLocation = (PFNBINDFRAGDATALOCATIONPROC)IntGetProcAddress("glBindFragDataLocation"); + BindFragDataLocation(program, color, name); + } + + static GLint CODEGEN_FUNCPTR Switch_GetFragDataLocation(GLuint program, const GLchar *name) + { + GetFragDataLocation = (PFNGETFRAGDATALOCATIONPROC)IntGetProcAddress("glGetFragDataLocation"); + return GetFragDataLocation(program, name); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1ui(GLint location, GLuint v0) + { + Uniform1ui = (PFNUNIFORM1UIPROC)IntGetProcAddress("glUniform1ui"); + Uniform1ui(location, v0); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2ui(GLint location, GLuint v0, GLuint v1) + { + Uniform2ui = (PFNUNIFORM2UIPROC)IntGetProcAddress("glUniform2ui"); + Uniform2ui(location, v0, v1); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3ui(GLint location, GLuint v0, GLuint v1, GLuint v2) + { + Uniform3ui = (PFNUNIFORM3UIPROC)IntGetProcAddress("glUniform3ui"); + Uniform3ui(location, v0, v1, v2); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4ui(GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3) + { + Uniform4ui = (PFNUNIFORM4UIPROC)IntGetProcAddress("glUniform4ui"); + Uniform4ui(location, v0, v1, v2, v3); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1uiv(GLint location, GLsizei count, const GLuint *value) + { + Uniform1uiv = (PFNUNIFORM1UIVPROC)IntGetProcAddress("glUniform1uiv"); + Uniform1uiv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2uiv(GLint location, GLsizei count, const GLuint *value) + { + Uniform2uiv = (PFNUNIFORM2UIVPROC)IntGetProcAddress("glUniform2uiv"); + Uniform2uiv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3uiv(GLint location, GLsizei count, const GLuint *value) + { + Uniform3uiv = (PFNUNIFORM3UIVPROC)IntGetProcAddress("glUniform3uiv"); + Uniform3uiv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4uiv(GLint location, GLsizei count, const GLuint *value) + { + Uniform4uiv = (PFNUNIFORM4UIVPROC)IntGetProcAddress("glUniform4uiv"); + Uniform4uiv(location, count, value); + } + + static void CODEGEN_FUNCPTR 
Switch_TexParameterIiv(GLenum target, GLenum pname, const GLint *params) + { + TexParameterIiv = (PFNTEXPARAMETERIIVPROC)IntGetProcAddress("glTexParameterIiv"); + TexParameterIiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_TexParameterIuiv(GLenum target, GLenum pname, const GLuint *params) + { + TexParameterIuiv = (PFNTEXPARAMETERIUIVPROC)IntGetProcAddress("glTexParameterIuiv"); + TexParameterIuiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexParameterIiv(GLenum target, GLenum pname, GLint *params) + { + GetTexParameterIiv = (PFNGETTEXPARAMETERIIVPROC)IntGetProcAddress("glGetTexParameterIiv"); + GetTexParameterIiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexParameterIuiv(GLenum target, GLenum pname, GLuint *params) + { + GetTexParameterIuiv = (PFNGETTEXPARAMETERIUIVPROC)IntGetProcAddress("glGetTexParameterIuiv"); + GetTexParameterIuiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value) + { + ClearBufferiv = (PFNCLEARBUFFERIVPROC)IntGetProcAddress("glClearBufferiv"); + ClearBufferiv(buffer, drawbuffer, value); + } + + static void CODEGEN_FUNCPTR Switch_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value) + { + ClearBufferuiv = (PFNCLEARBUFFERUIVPROC)IntGetProcAddress("glClearBufferuiv"); + ClearBufferuiv(buffer, drawbuffer, value); + } + + static void CODEGEN_FUNCPTR Switch_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value) + { + ClearBufferfv = (PFNCLEARBUFFERFVPROC)IntGetProcAddress("glClearBufferfv"); + ClearBufferfv(buffer, drawbuffer, value); + } + + static void CODEGEN_FUNCPTR Switch_ClearBufferfi(GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil) + { + ClearBufferfi = (PFNCLEARBUFFERFIPROC)IntGetProcAddress("glClearBufferfi"); + ClearBufferfi(buffer, drawbuffer, depth, stencil); + } + + static const GLubyte * CODEGEN_FUNCPTR Switch_GetStringi(GLenum name, GLuint index) + { + GetStringi = (PFNGETSTRINGIPROC)IntGetProcAddress("glGetStringi"); + return GetStringi(name, index); + } + + // Extension: ARB_uniform_buffer_object + + static void CODEGEN_FUNCPTR Switch_GetUniformIndices(GLuint program, GLsizei uniformCount, const GLchar* const *uniformNames, GLuint *uniformIndices) + { + GetUniformIndices = (PFNGETUNIFORMINDICESPROC)IntGetProcAddress("glGetUniformIndices"); + GetUniformIndices(program, uniformCount, uniformNames, uniformIndices); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniformsiv(GLuint program, GLsizei uniformCount, const GLuint *uniformIndices, GLenum pname, GLint *params) + { + GetActiveUniformsiv = (PFNGETACTIVEUNIFORMSIVPROC)IntGetProcAddress("glGetActiveUniformsiv"); + GetActiveUniformsiv(program, uniformCount, uniformIndices, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniformName(GLuint program, GLuint uniformIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformName) + { + GetActiveUniformName = (PFNGETACTIVEUNIFORMNAMEPROC)IntGetProcAddress("glGetActiveUniformName"); + GetActiveUniformName(program, uniformIndex, bufSize, length, uniformName); + } + + static GLuint CODEGEN_FUNCPTR Switch_GetUniformBlockIndex(GLuint program, const GLchar *uniformBlockName) + { + GetUniformBlockIndex = (PFNGETUNIFORMBLOCKINDEXPROC)IntGetProcAddress("glGetUniformBlockIndex"); + return GetUniformBlockIndex(program, uniformBlockName); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniformBlockiv(GLuint program, GLuint uniformBlockIndex, GLenum 
pname, GLint *params) + { + GetActiveUniformBlockiv = (PFNGETACTIVEUNIFORMBLOCKIVPROC)IntGetProcAddress("glGetActiveUniformBlockiv"); + GetActiveUniformBlockiv(program, uniformBlockIndex, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniformBlockName(GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformBlockName) + { + GetActiveUniformBlockName = (PFNGETACTIVEUNIFORMBLOCKNAMEPROC)IntGetProcAddress("glGetActiveUniformBlockName"); + GetActiveUniformBlockName(program, uniformBlockIndex, bufSize, length, uniformBlockName); + } + + static void CODEGEN_FUNCPTR Switch_UniformBlockBinding(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding) + { + UniformBlockBinding = (PFNUNIFORMBLOCKBINDINGPROC)IntGetProcAddress("glUniformBlockBinding"); + UniformBlockBinding(program, uniformBlockIndex, uniformBlockBinding); + } + + // Extension: ARB_copy_buffer + + static void CODEGEN_FUNCPTR Switch_CopyBufferSubData(GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size) + { + CopyBufferSubData = (PFNCOPYBUFFERSUBDATAPROC)IntGetProcAddress("glCopyBufferSubData"); + CopyBufferSubData(readTarget, writeTarget, readOffset, writeOffset, size); + } + + // Extension: 3.1 + + static void CODEGEN_FUNCPTR Switch_DrawArraysInstanced(GLenum mode, GLint first, GLsizei count, GLsizei instancecount) + { + DrawArraysInstanced = (PFNDRAWARRAYSINSTANCEDPROC)IntGetProcAddress("glDrawArraysInstanced"); + DrawArraysInstanced(mode, first, count, instancecount); + } + + static void CODEGEN_FUNCPTR Switch_DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices, GLsizei instancecount) + { + DrawElementsInstanced = (PFNDRAWELEMENTSINSTANCEDPROC)IntGetProcAddress("glDrawElementsInstanced"); + DrawElementsInstanced(mode, count, type, indices, instancecount); + } + + static void CODEGEN_FUNCPTR Switch_TexBuffer(GLenum target, GLenum internalformat, GLuint buffer) + { + TexBuffer = (PFNTEXBUFFERPROC)IntGetProcAddress("glTexBuffer"); + TexBuffer(target, internalformat, buffer); + } + + static void CODEGEN_FUNCPTR Switch_PrimitiveRestartIndex(GLuint index) + { + PrimitiveRestartIndex = (PFNPRIMITIVERESTARTINDEXPROC)IntGetProcAddress("glPrimitiveRestartIndex"); + PrimitiveRestartIndex(index); + } + + // Legacy + + static void CODEGEN_FUNCPTR Switch_EnableClientState(GLenum cap) + { + EnableClientState = (PFNENABLECLIENTSTATEPROC)IntGetProcAddress("glEnableClientState"); + EnableClientState(cap); + } + + static void CODEGEN_FUNCPTR Switch_DisableClientState(GLenum cap) + { + DisableClientState = (PFNDISABLECLIENTSTATEPROC)IntGetProcAddress("glDisableClientState"); + DisableClientState(cap); + } + + static void CODEGEN_FUNCPTR Switch_VertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) + { + VertexPointer = (PFNVERTEXPOINTERPROC)IntGetProcAddress("glVertexPointer"); + VertexPointer(size, type, stride, ptr); + } + + static void CODEGEN_FUNCPTR Switch_NormalPointer(GLenum type, GLsizei stride, const GLvoid *ptr) + { + NormalPointer = (PFNNORMALPOINTERPROC)IntGetProcAddress("glNormalPointer"); + NormalPointer(type, stride, ptr); + } + + static void CODEGEN_FUNCPTR Switch_ColorPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) + { + ColorPointer = (PFNCOLORPOINTERPROC)IntGetProcAddress("glColorPointer"); + ColorPointer(size, type, stride, ptr); + } + + static void CODEGEN_FUNCPTR Switch_TexCoordPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) + { 
+ TexCoordPointer = (PFNTEXCOORDPOINTERPROC)IntGetProcAddress("glTexCoordPointer"); + TexCoordPointer(size, type, stride, ptr); + } + + static void CODEGEN_FUNCPTR Switch_TexEnvi(GLenum target, GLenum pname, GLint param) + { + TexEnvi = (PFNTEXENVIPROC)IntGetProcAddress("glTexEnvi"); + TexEnvi(target, pname, param); + } + + static void CODEGEN_FUNCPTR Switch_MatrixMode(GLenum mode) + { + MatrixMode = (PFNMATRIXMODEPROC)IntGetProcAddress("glMatrixMode"); + MatrixMode(mode); + } + + static void CODEGEN_FUNCPTR Switch_LoadIdentity(void) + { + LoadIdentity = (PFNLOADIDENTITYPROC)IntGetProcAddress("glLoadIdentity"); + LoadIdentity(); + } + + static void CODEGEN_FUNCPTR Switch_Ortho(GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble near_val, GLdouble far_val) + { + Ortho = (PFNORTHOPROC)IntGetProcAddress("glOrtho"); + Ortho(left, right, bottom, top, near_val, far_val); + } + + static void CODEGEN_FUNCPTR Switch_Color3d(GLdouble red, GLdouble green, GLdouble blue) + { + Color3d = (PFNCOLOR3DPROC)IntGetProcAddress("glColor3d"); + Color3d(red, green, blue); + } + + struct InitializeVariables + { + InitializeVariables() + { + // Extension: 1.1 + CullFace = Switch_CullFace; + FrontFace = Switch_FrontFace; + Hint = Switch_Hint; + LineWidth = Switch_LineWidth; + PointSize = Switch_PointSize; + PolygonMode = Switch_PolygonMode; + Scissor = Switch_Scissor; + TexParameterf = Switch_TexParameterf; + TexParameterfv = Switch_TexParameterfv; + TexParameteri = Switch_TexParameteri; + TexParameteriv = Switch_TexParameteriv; + TexImage1D = Switch_TexImage1D; + TexImage2D = Switch_TexImage2D; + DrawBuffer = Switch_DrawBuffer; + Clear = Switch_Clear; + ClearColor = Switch_ClearColor; + ClearStencil = Switch_ClearStencil; + ClearDepth = Switch_ClearDepth; + StencilMask = Switch_StencilMask; + ColorMask = Switch_ColorMask; + DepthMask = Switch_DepthMask; + Disable = Switch_Disable; + Enable = Switch_Enable; + Finish = Switch_Finish; + Flush = Switch_Flush; + BlendFunc = Switch_BlendFunc; + LogicOp = Switch_LogicOp; + StencilFunc = Switch_StencilFunc; + StencilOp = Switch_StencilOp; + DepthFunc = Switch_DepthFunc; + PixelStoref = Switch_PixelStoref; + PixelStorei = Switch_PixelStorei; + ReadBuffer = Switch_ReadBuffer; + ReadPixels = Switch_ReadPixels; + GetBooleanv = Switch_GetBooleanv; + GetDoublev = Switch_GetDoublev; + GetError = Switch_GetError; + GetFloatv = Switch_GetFloatv; + GetIntegerv = Switch_GetIntegerv; + GetString = Switch_GetString; + GetTexImage = Switch_GetTexImage; + GetTexParameterfv = Switch_GetTexParameterfv; + GetTexParameteriv = Switch_GetTexParameteriv; + GetTexLevelParameterfv = Switch_GetTexLevelParameterfv; + GetTexLevelParameteriv = Switch_GetTexLevelParameteriv; + IsEnabled = Switch_IsEnabled; + DepthRange = Switch_DepthRange; + Viewport = Switch_Viewport; + DrawArrays = Switch_DrawArrays; + DrawElements = Switch_DrawElements; + GetPointerv = Switch_GetPointerv; + PolygonOffset = Switch_PolygonOffset; + CopyTexImage1D = Switch_CopyTexImage1D; + CopyTexImage2D = Switch_CopyTexImage2D; + CopyTexSubImage1D = Switch_CopyTexSubImage1D; + CopyTexSubImage2D = Switch_CopyTexSubImage2D; + TexSubImage1D = Switch_TexSubImage1D; + TexSubImage2D = Switch_TexSubImage2D; + BindTexture = Switch_BindTexture; + DeleteTextures = Switch_DeleteTextures; + GenTextures = Switch_GenTextures; + IsTexture = Switch_IsTexture; + Indexub = Switch_Indexub; + Indexubv = Switch_Indexubv; + + // Extension: 1.2 + BlendColor = Switch_BlendColor; + BlendEquation = Switch_BlendEquation; + 
DrawRangeElements = Switch_DrawRangeElements; + TexSubImage3D = Switch_TexSubImage3D; + CopyTexSubImage3D = Switch_CopyTexSubImage3D; + + // Extension: 1.3 + ActiveTexture = Switch_ActiveTexture; + SampleCoverage = Switch_SampleCoverage; + CompressedTexImage3D = Switch_CompressedTexImage3D; + CompressedTexImage2D = Switch_CompressedTexImage2D; + CompressedTexImage1D = Switch_CompressedTexImage1D; + CompressedTexSubImage3D = Switch_CompressedTexSubImage3D; + CompressedTexSubImage2D = Switch_CompressedTexSubImage2D; + CompressedTexSubImage1D = Switch_CompressedTexSubImage1D; + GetCompressedTexImage = Switch_GetCompressedTexImage; + + // Extension: 1.4 + BlendFuncSeparate = Switch_BlendFuncSeparate; + MultiDrawArrays = Switch_MultiDrawArrays; + MultiDrawElements = Switch_MultiDrawElements; + PointParameterf = Switch_PointParameterf; + PointParameterfv = Switch_PointParameterfv; + PointParameteri = Switch_PointParameteri; + PointParameteriv = Switch_PointParameteriv; + + // Extension: 1.5 + GenQueries = Switch_GenQueries; + DeleteQueries = Switch_DeleteQueries; + IsQuery = Switch_IsQuery; + BeginQuery = Switch_BeginQuery; + EndQuery = Switch_EndQuery; + GetQueryiv = Switch_GetQueryiv; + GetQueryObjectiv = Switch_GetQueryObjectiv; + GetQueryObjectuiv = Switch_GetQueryObjectuiv; + BindBuffer = Switch_BindBuffer; + DeleteBuffers = Switch_DeleteBuffers; + GenBuffers = Switch_GenBuffers; + IsBuffer = Switch_IsBuffer; + BufferData = Switch_BufferData; + BufferSubData = Switch_BufferSubData; + GetBufferSubData = Switch_GetBufferSubData; + MapBuffer = Switch_MapBuffer; + UnmapBuffer = Switch_UnmapBuffer; + GetBufferParameteriv = Switch_GetBufferParameteriv; + GetBufferPointerv = Switch_GetBufferPointerv; + + // Extension: 2.0 + BlendEquationSeparate = Switch_BlendEquationSeparate; + DrawBuffers = Switch_DrawBuffers; + StencilOpSeparate = Switch_StencilOpSeparate; + StencilFuncSeparate = Switch_StencilFuncSeparate; + StencilMaskSeparate = Switch_StencilMaskSeparate; + AttachShader = Switch_AttachShader; + BindAttribLocation = Switch_BindAttribLocation; + CompileShader = Switch_CompileShader; + CreateProgram = Switch_CreateProgram; + CreateShader = Switch_CreateShader; + DeleteProgram = Switch_DeleteProgram; + DeleteShader = Switch_DeleteShader; + DetachShader = Switch_DetachShader; + DisableVertexAttribArray = Switch_DisableVertexAttribArray; + EnableVertexAttribArray = Switch_EnableVertexAttribArray; + GetActiveAttrib = Switch_GetActiveAttrib; + GetActiveUniform = Switch_GetActiveUniform; + GetAttachedShaders = Switch_GetAttachedShaders; + GetAttribLocation = Switch_GetAttribLocation; + GetProgramiv = Switch_GetProgramiv; + GetProgramInfoLog = Switch_GetProgramInfoLog; + GetShaderiv = Switch_GetShaderiv; + GetShaderInfoLog = Switch_GetShaderInfoLog; + GetShaderSource = Switch_GetShaderSource; + GetUniformLocation = Switch_GetUniformLocation; + GetUniformfv = Switch_GetUniformfv; + GetUniformiv = Switch_GetUniformiv; + GetVertexAttribdv = Switch_GetVertexAttribdv; + GetVertexAttribfv = Switch_GetVertexAttribfv; + GetVertexAttribiv = Switch_GetVertexAttribiv; + GetVertexAttribPointerv = Switch_GetVertexAttribPointerv; + IsProgram = Switch_IsProgram; + IsShader = Switch_IsShader; + LinkProgram = Switch_LinkProgram; + ShaderSource = Switch_ShaderSource; + UseProgram = Switch_UseProgram; + Uniform1f = Switch_Uniform1f; + Uniform2f = Switch_Uniform2f; + Uniform3f = Switch_Uniform3f; + Uniform4f = Switch_Uniform4f; + Uniform1i = Switch_Uniform1i; + Uniform2i = Switch_Uniform2i; + Uniform3i = 
Switch_Uniform3i; + Uniform4i = Switch_Uniform4i; + Uniform1fv = Switch_Uniform1fv; + Uniform2fv = Switch_Uniform2fv; + Uniform3fv = Switch_Uniform3fv; + Uniform4fv = Switch_Uniform4fv; + Uniform1iv = Switch_Uniform1iv; + Uniform2iv = Switch_Uniform2iv; + Uniform3iv = Switch_Uniform3iv; + Uniform4iv = Switch_Uniform4iv; + UniformMatrix2fv = Switch_UniformMatrix2fv; + UniformMatrix3fv = Switch_UniformMatrix3fv; + UniformMatrix4fv = Switch_UniformMatrix4fv; + ValidateProgram = Switch_ValidateProgram; + VertexAttribPointer = Switch_VertexAttribPointer; + + // Extension: 2.1 + UniformMatrix2x3fv = Switch_UniformMatrix2x3fv; + UniformMatrix3x2fv = Switch_UniformMatrix3x2fv; + UniformMatrix2x4fv = Switch_UniformMatrix2x4fv; + UniformMatrix4x2fv = Switch_UniformMatrix4x2fv; + UniformMatrix3x4fv = Switch_UniformMatrix3x4fv; + UniformMatrix4x3fv = Switch_UniformMatrix4x3fv; + + // Extension: ARB_vertex_array_object + BindVertexArray = Switch_BindVertexArray; + DeleteVertexArrays = Switch_DeleteVertexArrays; + GenVertexArrays = Switch_GenVertexArrays; + IsVertexArray = Switch_IsVertexArray; + + // Extension: ARB_map_buffer_range + MapBufferRange = Switch_MapBufferRange; + FlushMappedBufferRange = Switch_FlushMappedBufferRange; + + // Extension: ARB_framebuffer_object + IsRenderbuffer = Switch_IsRenderbuffer; + BindRenderbuffer = Switch_BindRenderbuffer; + DeleteRenderbuffers = Switch_DeleteRenderbuffers; + GenRenderbuffers = Switch_GenRenderbuffers; + RenderbufferStorage = Switch_RenderbufferStorage; + GetRenderbufferParameteriv = Switch_GetRenderbufferParameteriv; + IsFramebuffer = Switch_IsFramebuffer; + BindFramebuffer = Switch_BindFramebuffer; + DeleteFramebuffers = Switch_DeleteFramebuffers; + GenFramebuffers = Switch_GenFramebuffers; + CheckFramebufferStatus = Switch_CheckFramebufferStatus; + FramebufferTexture1D = Switch_FramebufferTexture1D; + FramebufferTexture2D = Switch_FramebufferTexture2D; + FramebufferTexture3D = Switch_FramebufferTexture3D; + FramebufferRenderbuffer = Switch_FramebufferRenderbuffer; + GetFramebufferAttachmentParameteriv = Switch_GetFramebufferAttachmentParameteriv; + GenerateMipmap = Switch_GenerateMipmap; + BlitFramebuffer = Switch_BlitFramebuffer; + RenderbufferStorageMultisample = Switch_RenderbufferStorageMultisample; + FramebufferTextureLayer = Switch_FramebufferTextureLayer; + + // Extension: 3.0 + ColorMaski = Switch_ColorMaski; + GetBooleani_v = Switch_GetBooleani_v; + GetIntegeri_v = Switch_GetIntegeri_v; + Enablei = Switch_Enablei; + Disablei = Switch_Disablei; + IsEnabledi = Switch_IsEnabledi; + BeginTransformFeedback = Switch_BeginTransformFeedback; + EndTransformFeedback = Switch_EndTransformFeedback; + BindBufferRange = Switch_BindBufferRange; + BindBufferBase = Switch_BindBufferBase; + TransformFeedbackVaryings = Switch_TransformFeedbackVaryings; + GetTransformFeedbackVarying = Switch_GetTransformFeedbackVarying; + ClampColor = Switch_ClampColor; + BeginConditionalRender = Switch_BeginConditionalRender; + EndConditionalRender = Switch_EndConditionalRender; + VertexAttribIPointer = Switch_VertexAttribIPointer; + GetVertexAttribIiv = Switch_GetVertexAttribIiv; + GetVertexAttribIuiv = Switch_GetVertexAttribIuiv; + VertexAttribI1i = Switch_VertexAttribI1i; + VertexAttribI2i = Switch_VertexAttribI2i; + VertexAttribI3i = Switch_VertexAttribI3i; + VertexAttribI4i = Switch_VertexAttribI4i; + VertexAttribI1ui = Switch_VertexAttribI1ui; + VertexAttribI2ui = Switch_VertexAttribI2ui; + VertexAttribI3ui = Switch_VertexAttribI3ui; + VertexAttribI4ui = 
Switch_VertexAttribI4ui;
+            VertexAttribI1iv = Switch_VertexAttribI1iv;
+            VertexAttribI2iv = Switch_VertexAttribI2iv;
+            VertexAttribI3iv = Switch_VertexAttribI3iv;
+            VertexAttribI4iv = Switch_VertexAttribI4iv;
+            VertexAttribI1uiv = Switch_VertexAttribI1uiv;
+            VertexAttribI2uiv = Switch_VertexAttribI2uiv;
+            VertexAttribI3uiv = Switch_VertexAttribI3uiv;
+            VertexAttribI4uiv = Switch_VertexAttribI4uiv;
+            VertexAttribI4bv = Switch_VertexAttribI4bv;
+            VertexAttribI4sv = Switch_VertexAttribI4sv;
+            VertexAttribI4ubv = Switch_VertexAttribI4ubv;
+            VertexAttribI4usv = Switch_VertexAttribI4usv;
+            GetUniformuiv = Switch_GetUniformuiv;
+            BindFragDataLocation = Switch_BindFragDataLocation;
+            GetFragDataLocation = Switch_GetFragDataLocation;
+            Uniform1ui = Switch_Uniform1ui;
+            Uniform2ui = Switch_Uniform2ui;
+            Uniform3ui = Switch_Uniform3ui;
+            Uniform4ui = Switch_Uniform4ui;
+            Uniform1uiv = Switch_Uniform1uiv;
+            Uniform2uiv = Switch_Uniform2uiv;
+            Uniform3uiv = Switch_Uniform3uiv;
+            Uniform4uiv = Switch_Uniform4uiv;
+            TexParameterIiv = Switch_TexParameterIiv;
+            TexParameterIuiv = Switch_TexParameterIuiv;
+            GetTexParameterIiv = Switch_GetTexParameterIiv;
+            GetTexParameterIuiv = Switch_GetTexParameterIuiv;
+            ClearBufferiv = Switch_ClearBufferiv;
+            ClearBufferuiv = Switch_ClearBufferuiv;
+            ClearBufferfv = Switch_ClearBufferfv;
+            ClearBufferfi = Switch_ClearBufferfi;
+            GetStringi = Switch_GetStringi;
+
+            // Extension: ARB_uniform_buffer_object
+            GetUniformIndices = Switch_GetUniformIndices;
+            GetActiveUniformsiv = Switch_GetActiveUniformsiv;
+            GetActiveUniformName = Switch_GetActiveUniformName;
+            GetUniformBlockIndex = Switch_GetUniformBlockIndex;
+            GetActiveUniformBlockiv = Switch_GetActiveUniformBlockiv;
+            GetActiveUniformBlockName = Switch_GetActiveUniformBlockName;
+            UniformBlockBinding = Switch_UniformBlockBinding;
+
+            // Extension: ARB_copy_buffer
+            CopyBufferSubData = Switch_CopyBufferSubData;
+
+            // Extension: 3.1
+            DrawArraysInstanced = Switch_DrawArraysInstanced;
+            DrawElementsInstanced = Switch_DrawElementsInstanced;
+            TexBuffer = Switch_TexBuffer;
+            PrimitiveRestartIndex = Switch_PrimitiveRestartIndex;
+
+            // Legacy
+            EnableClientState = Switch_EnableClientState;
+            DisableClientState = Switch_DisableClientState;
+            VertexPointer = Switch_VertexPointer;
+            NormalPointer = Switch_NormalPointer;
+            ColorPointer = Switch_ColorPointer;
+            TexCoordPointer = Switch_TexCoordPointer;
+            TexEnvi = Switch_TexEnvi;
+            MatrixMode = Switch_MatrixMode;
+            LoadIdentity = Switch_LoadIdentity;
+            Ortho = Switch_Ortho;
+            Color3d = Switch_Color3d;
+        }
+    };
+
+    InitializeVariables g_initVariables;
+}
diff --git a/modules/core/src/gl_core_3_1.hpp b/modules/core/src/gl_core_3_1.hpp
new file mode 100644
index 0000000000..50dbee66c7
--- /dev/null
+++ b/modules/core/src/gl_core_3_1.hpp
@@ -0,0 +1,1331 @@
+#ifndef OPENGL_NOLOAD_STYLE_HPP
+#define OPENGL_NOLOAD_STYLE_HPP
+
+#if defined(__gl_h_) || defined(__GL_H__)
+#error Attempt to include auto-generated header after including gl.h
+#endif
+#if defined(__glext_h_) || defined(__GLEXT_H_)
+#error Attempt to include auto-generated header after including glext.h
+#endif
+#if defined(__gl_ATI_h_)
+#error Attempt to include auto-generated header after including glATI.h
+#endif
+
+#define __gl_h_
+#define __GL_H__
+#define __glext_h_
+#define __GLEXT_H_
+#define __gl_ATI_h_
+
+#ifndef APIENTRY
+    #if defined(__MINGW32__)
+        #ifndef WIN32_LEAN_AND_MEAN
+            #define WIN32_LEAN_AND_MEAN 1
+        #endif
+        #ifndef NOMINMAX
+            #define NOMINMAX
+        #endif
+        #include <windows.h>
+    #elif (defined(_MSC_VER) && _MSC_VER >= 800) || defined(_STDCALL_SUPPORTED) || defined(__BORLANDC__)
+        #ifndef WIN32_LEAN_AND_MEAN
+            #define WIN32_LEAN_AND_MEAN 1
+        #endif
+        #ifndef NOMINMAX
+            #define NOMINMAX
+        #endif
+        #include <windows.h>
+    #else
+        #define APIENTRY
+    #endif
+#endif // APIENTRY
+
+#ifndef CODEGEN_FUNCPTR
+    #define CODEGEN_REMOVE_FUNCPTR
+    #if defined(_WIN32)
+        #define CODEGEN_FUNCPTR APIENTRY
+    #else
+        #define CODEGEN_FUNCPTR
+    #endif
+#endif // CODEGEN_FUNCPTR
+
+#ifndef GL_LOAD_GEN_BASIC_OPENGL_TYPEDEFS
+#define GL_LOAD_GEN_BASIC_OPENGL_TYPEDEFS
+    typedef unsigned int GLenum;
+    typedef unsigned char GLboolean;
+    typedef unsigned int GLbitfield;
+    typedef signed char GLbyte;
+    typedef short GLshort;
+    typedef int GLint;
+    typedef int GLsizei;
+    typedef unsigned char GLubyte;
+    typedef unsigned short GLushort;
+    typedef unsigned int GLuint;
+    typedef float GLfloat;
+    typedef float GLclampf;
+    typedef double GLdouble;
+    typedef double GLclampd;
+    #define GLvoid void
+#endif // GL_LOAD_GEN_BASIC_OPENGL_TYPEDEFS
+
+#include <stddef.h>
+
+#ifndef GL_VERSION_2_0
+    // GL type for program/shader text
+    typedef char GLchar;
+#endif
+
+#ifndef GL_VERSION_1_5
+    // GL types for handling large vertex buffer objects
+    typedef ptrdiff_t GLintptr;
+    typedef ptrdiff_t GLsizeiptr;
+#endif
+
+#ifndef GL_ARB_vertex_buffer_object
+    // GL types for handling large vertex buffer objects
+    typedef ptrdiff_t GLintptrARB;
+    typedef ptrdiff_t GLsizeiptrARB;
+#endif
+
+#ifndef GL_ARB_shader_objects
+    // GL types for program/shader text and shader object handles
+    typedef char GLcharARB;
+    typedef unsigned int GLhandleARB;
+#endif
+
+// GL type for "half" precision (s10e5) float data in host memory
+#ifndef GL_ARB_half_float_pixel
+    typedef unsigned short GLhalfARB;
+#endif
+#ifndef GL_NV_half_float
+    typedef unsigned short GLhalfNV;
+#endif
+
+#ifndef GLEXT_64_TYPES_DEFINED
+    // This code block is duplicated in glxext.h, so must be protected
+    #define GLEXT_64_TYPES_DEFINED
+
+    // Define int32_t, int64_t, and uint64_t types for UST/MSC
+    // (as used in the GL_EXT_timer_query extension)
+    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+        #include <inttypes.h>
+    #elif defined(__sun__) || defined(__digital__)
+        #include <inttypes.h>
+        #if defined(__STDC__)
+            #if defined(__arch64__) || defined(_LP64)
+                typedef long int int64_t;
+                typedef unsigned long int uint64_t;
+            #else
+                typedef long long int int64_t;
+                typedef unsigned long long int uint64_t;
+            #endif // __arch64__
+        #endif // __STDC__
+    #elif defined( __VMS ) || defined(__sgi)
+        #include <inttypes.h>
+    #elif defined(__SCO__) || defined(__USLC__)
+        #include <stdint.h>
+    #elif defined(__UNIXOS2__) || defined(__SOL64__)
+        typedef long int int32_t;
+        typedef long long int int64_t;
+        typedef unsigned long long int uint64_t;
+    #elif defined(_WIN32) && defined(__GNUC__)
+        #include <stdint.h>
+    #elif defined(_WIN32)
+        typedef __int32 int32_t;
+        typedef __int64 int64_t;
+        typedef unsigned __int64 uint64_t;
+    #else
+        // Fallback if nothing above works
+        #include <inttypes.h>
+    #endif
+#endif
+
+#ifndef GL_EXT_timer_query
+    typedef int64_t GLint64EXT;
+    typedef uint64_t GLuint64EXT;
+#endif
+
+#ifndef GL_ARB_sync
+    typedef int64_t GLint64;
+    typedef uint64_t GLuint64;
+    typedef struct __GLsync *GLsync;
+#endif
+
+#ifndef GL_ARB_cl_event
+    // These incomplete types let us declare types compatible with OpenCL's cl_context and cl_event
+    struct _cl_context;
+    struct _cl_event;
+#endif
+
+#ifndef GL_ARB_debug_output
+    typedef void (APIENTRY *GLDEBUGPROCARB)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,GLvoid *userParam);
+#endif
+
+#ifndef
GL_AMD_debug_output + typedef void (APIENTRY *GLDEBUGPROCAMD)(GLuint id,GLenum category,GLenum severity,GLsizei length,const GLchar *message,GLvoid *userParam); +#endif + +#ifndef GL_KHR_debug + typedef void (APIENTRY *GLDEBUGPROC)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,GLvoid *userParam); +#endif + +#ifndef GL_NV_vdpau_interop + typedef GLintptr GLvdpauSurfaceNV; +#endif + +namespace gl +{ + enum + { + // Version: 1.1 + DEPTH_BUFFER_BIT = 0x00000100, + STENCIL_BUFFER_BIT = 0x00000400, + COLOR_BUFFER_BIT = 0x00004000, + FALSE_ = 0, + TRUE_ = 1, + POINTS = 0x0000, + LINES = 0x0001, + LINE_LOOP = 0x0002, + LINE_STRIP = 0x0003, + TRIANGLES = 0x0004, + TRIANGLE_STRIP = 0x0005, + TRIANGLE_FAN = 0x0006, + QUADS = 0x0007, + NEVER = 0x0200, + LESS = 0x0201, + EQUAL = 0x0202, + LEQUAL = 0x0203, + GREATER = 0x0204, + NOTEQUAL = 0x0205, + GEQUAL = 0x0206, + ALWAYS = 0x0207, + ZERO = 0, + ONE = 1, + SRC_COLOR = 0x0300, + ONE_MINUS_SRC_COLOR = 0x0301, + SRC_ALPHA = 0x0302, + ONE_MINUS_SRC_ALPHA = 0x0303, + DST_ALPHA = 0x0304, + ONE_MINUS_DST_ALPHA = 0x0305, + DST_COLOR = 0x0306, + ONE_MINUS_DST_COLOR = 0x0307, + SRC_ALPHA_SATURATE = 0x0308, + NONE = 0, + FRONT_LEFT = 0x0400, + FRONT_RIGHT = 0x0401, + BACK_LEFT = 0x0402, + BACK_RIGHT = 0x0403, + FRONT = 0x0404, + BACK = 0x0405, + LEFT = 0x0406, + RIGHT = 0x0407, + FRONT_AND_BACK = 0x0408, + NO_ERROR_ = 0, + INVALID_ENUM = 0x0500, + INVALID_VALUE = 0x0501, + INVALID_OPERATION = 0x0502, + OUT_OF_MEMORY = 0x0505, + CW = 0x0900, + CCW = 0x0901, + POINT_SIZE = 0x0B11, + POINT_SIZE_RANGE = 0x0B12, + POINT_SIZE_GRANULARITY = 0x0B13, + LINE_SMOOTH = 0x0B20, + LINE_WIDTH = 0x0B21, + LINE_WIDTH_RANGE = 0x0B22, + LINE_WIDTH_GRANULARITY = 0x0B23, + POLYGON_MODE = 0x0B40, + POLYGON_SMOOTH = 0x0B41, + CULL_FACE = 0x0B44, + CULL_FACE_MODE = 0x0B45, + FRONT_FACE = 0x0B46, + DEPTH_RANGE = 0x0B70, + DEPTH_TEST = 0x0B71, + DEPTH_WRITEMASK = 0x0B72, + DEPTH_CLEAR_VALUE = 0x0B73, + DEPTH_FUNC = 0x0B74, + STENCIL_TEST = 0x0B90, + STENCIL_CLEAR_VALUE = 0x0B91, + STENCIL_FUNC = 0x0B92, + STENCIL_VALUE_MASK = 0x0B93, + STENCIL_FAIL = 0x0B94, + STENCIL_PASS_DEPTH_FAIL = 0x0B95, + STENCIL_PASS_DEPTH_PASS = 0x0B96, + STENCIL_REF = 0x0B97, + STENCIL_WRITEMASK = 0x0B98, + VIEWPORT = 0x0BA2, + DITHER = 0x0BD0, + BLEND_DST = 0x0BE0, + BLEND_SRC = 0x0BE1, + BLEND = 0x0BE2, + LOGIC_OP_MODE = 0x0BF0, + COLOR_LOGIC_OP = 0x0BF2, + DRAW_BUFFER = 0x0C01, + READ_BUFFER = 0x0C02, + SCISSOR_BOX = 0x0C10, + SCISSOR_TEST = 0x0C11, + COLOR_CLEAR_VALUE = 0x0C22, + COLOR_WRITEMASK = 0x0C23, + DOUBLEBUFFER = 0x0C32, + STEREO = 0x0C33, + LINE_SMOOTH_HINT = 0x0C52, + POLYGON_SMOOTH_HINT = 0x0C53, + UNPACK_SWAP_BYTES = 0x0CF0, + UNPACK_LSB_FIRST = 0x0CF1, + UNPACK_ROW_LENGTH = 0x0CF2, + UNPACK_SKIP_ROWS = 0x0CF3, + UNPACK_SKIP_PIXELS = 0x0CF4, + UNPACK_ALIGNMENT = 0x0CF5, + PACK_SWAP_BYTES = 0x0D00, + PACK_LSB_FIRST = 0x0D01, + PACK_ROW_LENGTH = 0x0D02, + PACK_SKIP_ROWS = 0x0D03, + PACK_SKIP_PIXELS = 0x0D04, + PACK_ALIGNMENT = 0x0D05, + MAX_TEXTURE_SIZE = 0x0D33, + MAX_VIEWPORT_DIMS = 0x0D3A, + SUBPIXEL_BITS = 0x0D50, + TEXTURE_1D = 0x0DE0, + TEXTURE_2D = 0x0DE1, + POLYGON_OFFSET_UNITS = 0x2A00, + POLYGON_OFFSET_POINT = 0x2A01, + POLYGON_OFFSET_LINE = 0x2A02, + POLYGON_OFFSET_FILL = 0x8037, + POLYGON_OFFSET_FACTOR = 0x8038, + TEXTURE_BINDING_1D = 0x8068, + TEXTURE_BINDING_2D = 0x8069, + TEXTURE_WIDTH = 0x1000, + TEXTURE_HEIGHT = 0x1001, + TEXTURE_INTERNAL_FORMAT = 0x1003, + TEXTURE_BORDER_COLOR = 0x1004, + TEXTURE_RED_SIZE = 0x805C, + TEXTURE_GREEN_SIZE 
= 0x805D, + TEXTURE_BLUE_SIZE = 0x805E, + TEXTURE_ALPHA_SIZE = 0x805F, + DONT_CARE = 0x1100, + FASTEST = 0x1101, + NICEST = 0x1102, + BYTE = 0x1400, + UNSIGNED_BYTE = 0x1401, + SHORT = 0x1402, + UNSIGNED_SHORT = 0x1403, + INT = 0x1404, + UNSIGNED_INT = 0x1405, + FLOAT = 0x1406, + DOUBLE = 0x140A, + CLEAR = 0x1500, + AND = 0x1501, + AND_REVERSE = 0x1502, + COPY = 0x1503, + AND_INVERTED = 0x1504, + NOOP = 0x1505, + XOR = 0x1506, + OR = 0x1507, + NOR = 0x1508, + EQUIV = 0x1509, + INVERT = 0x150A, + OR_REVERSE = 0x150B, + COPY_INVERTED = 0x150C, + OR_INVERTED = 0x150D, + NAND = 0x150E, + SET = 0x150F, + TEXTURE = 0x1702, + COLOR = 0x1800, + DEPTH = 0x1801, + STENCIL = 0x1802, + STENCIL_INDEX = 0x1901, + DEPTH_COMPONENT = 0x1902, + RED = 0x1903, + GREEN = 0x1904, + BLUE = 0x1905, + ALPHA = 0x1906, + RGB = 0x1907, + RGBA = 0x1908, + POINT = 0x1B00, + LINE = 0x1B01, + FILL = 0x1B02, + KEEP = 0x1E00, + REPLACE = 0x1E01, + INCR = 0x1E02, + DECR = 0x1E03, + VENDOR = 0x1F00, + RENDERER = 0x1F01, + VERSION_ = 0x1F02, + EXTENSIONS = 0x1F03, + NEAREST = 0x2600, + LINEAR = 0x2601, + NEAREST_MIPMAP_NEAREST = 0x2700, + LINEAR_MIPMAP_NEAREST = 0x2701, + NEAREST_MIPMAP_LINEAR = 0x2702, + LINEAR_MIPMAP_LINEAR = 0x2703, + TEXTURE_MAG_FILTER = 0x2800, + TEXTURE_MIN_FILTER = 0x2801, + TEXTURE_WRAP_S = 0x2802, + TEXTURE_WRAP_T = 0x2803, + PROXY_TEXTURE_1D = 0x8063, + PROXY_TEXTURE_2D = 0x8064, + REPEAT = 0x2901, + R3_G3_B2 = 0x2A10, + RGB4 = 0x804F, + RGB5 = 0x8050, + RGB8 = 0x8051, + RGB10 = 0x8052, + RGB12 = 0x8053, + RGB16 = 0x8054, + RGBA2 = 0x8055, + RGBA4 = 0x8056, + RGB5_A1 = 0x8057, + RGBA8 = 0x8058, + RGB10_A2 = 0x8059, + RGBA12 = 0x805A, + RGBA16 = 0x805B, + + // Core Extension: ARB_imaging + CONSTANT_COLOR = 0x8001, + ONE_MINUS_CONSTANT_COLOR = 0x8002, + CONSTANT_ALPHA = 0x8003, + ONE_MINUS_CONSTANT_ALPHA = 0x8004, + BLEND_COLOR = 0x8005, + FUNC_ADD = 0x8006, + MIN = 0x8007, + MAX = 0x8008, + BLEND_EQUATION = 0x8009, + FUNC_SUBTRACT = 0x800A, + FUNC_REVERSE_SUBTRACT = 0x800B, + CONVOLUTION_1D = 0x8010, + CONVOLUTION_2D = 0x8011, + SEPARABLE_2D = 0x8012, + CONVOLUTION_BORDER_MODE = 0x8013, + CONVOLUTION_FILTER_SCALE = 0x8014, + CONVOLUTION_FILTER_BIAS = 0x8015, + REDUCE = 0x8016, + CONVOLUTION_FORMAT = 0x8017, + CONVOLUTION_WIDTH = 0x8018, + CONVOLUTION_HEIGHT = 0x8019, + MAX_CONVOLUTION_WIDTH = 0x801A, + MAX_CONVOLUTION_HEIGHT = 0x801B, + POST_CONVOLUTION_RED_SCALE = 0x801C, + POST_CONVOLUTION_GREEN_SCALE = 0x801D, + POST_CONVOLUTION_BLUE_SCALE = 0x801E, + POST_CONVOLUTION_ALPHA_SCALE = 0x801F, + POST_CONVOLUTION_RED_BIAS = 0x8020, + POST_CONVOLUTION_GREEN_BIAS = 0x8021, + POST_CONVOLUTION_BLUE_BIAS = 0x8022, + POST_CONVOLUTION_ALPHA_BIAS = 0x8023, + HISTOGRAM = 0x8024, + PROXY_HISTOGRAM = 0x8025, + HISTOGRAM_WIDTH = 0x8026, + HISTOGRAM_FORMAT = 0x8027, + HISTOGRAM_RED_SIZE = 0x8028, + HISTOGRAM_GREEN_SIZE = 0x8029, + HISTOGRAM_BLUE_SIZE = 0x802A, + HISTOGRAM_ALPHA_SIZE = 0x802B, + HISTOGRAM_LUMINANCE_SIZE = 0x802C, + HISTOGRAM_SINK = 0x802D, + MINMAX = 0x802E, + MINMAX_FORMAT = 0x802F, + MINMAX_SINK = 0x8030, + TABLE_TOO_LARGE = 0x8031, + COLOR_MATRIX = 0x80B1, + COLOR_MATRIX_STACK_DEPTH = 0x80B2, + MAX_COLOR_MATRIX_STACK_DEPTH = 0x80B3, + POST_COLOR_MATRIX_RED_SCALE = 0x80B4, + POST_COLOR_MATRIX_GREEN_SCALE = 0x80B5, + POST_COLOR_MATRIX_BLUE_SCALE = 0x80B6, + POST_COLOR_MATRIX_ALPHA_SCALE = 0x80B7, + POST_COLOR_MATRIX_RED_BIAS = 0x80B8, + POST_COLOR_MATRIX_GREEN_BIAS = 0x80B9, + POST_COLOR_MATRIX_BLUE_BIAS = 0x80BA, + POST_COLOR_MATRIX_ALPHA_BIAS = 0x80BB, + COLOR_TABLE = 0x80D0, + 
POST_CONVOLUTION_COLOR_TABLE = 0x80D1, + POST_COLOR_MATRIX_COLOR_TABLE = 0x80D2, + PROXY_COLOR_TABLE = 0x80D3, + PROXY_POST_CONVOLUTION_COLOR_TABLE = 0x80D4, + PROXY_POST_COLOR_MATRIX_COLOR_TABLE = 0x80D5, + COLOR_TABLE_SCALE = 0x80D6, + COLOR_TABLE_BIAS = 0x80D7, + COLOR_TABLE_FORMAT = 0x80D8, + COLOR_TABLE_WIDTH = 0x80D9, + COLOR_TABLE_RED_SIZE = 0x80DA, + COLOR_TABLE_GREEN_SIZE = 0x80DB, + COLOR_TABLE_BLUE_SIZE = 0x80DC, + COLOR_TABLE_ALPHA_SIZE = 0x80DD, + COLOR_TABLE_LUMINANCE_SIZE = 0x80DE, + COLOR_TABLE_INTENSITY_SIZE = 0x80DF, + CONSTANT_BORDER = 0x8151, + REPLICATE_BORDER = 0x8153, + CONVOLUTION_BORDER_COLOR = 0x8154, + + // Version: 1.2 + UNSIGNED_BYTE_3_3_2 = 0x8032, + UNSIGNED_SHORT_4_4_4_4 = 0x8033, + UNSIGNED_SHORT_5_5_5_1 = 0x8034, + UNSIGNED_INT_8_8_8_8 = 0x8035, + UNSIGNED_INT_10_10_10_2 = 0x8036, + TEXTURE_BINDING_3D = 0x806A, + PACK_SKIP_IMAGES = 0x806B, + PACK_IMAGE_HEIGHT = 0x806C, + UNPACK_SKIP_IMAGES = 0x806D, + UNPACK_IMAGE_HEIGHT = 0x806E, + TEXTURE_3D = 0x806F, + PROXY_TEXTURE_3D = 0x8070, + TEXTURE_DEPTH = 0x8071, + TEXTURE_WRAP_R = 0x8072, + MAX_3D_TEXTURE_SIZE = 0x8073, + UNSIGNED_BYTE_2_3_3_REV = 0x8362, + UNSIGNED_SHORT_5_6_5 = 0x8363, + UNSIGNED_SHORT_5_6_5_REV = 0x8364, + UNSIGNED_SHORT_4_4_4_4_REV = 0x8365, + UNSIGNED_SHORT_1_5_5_5_REV = 0x8366, + UNSIGNED_INT_8_8_8_8_REV = 0x8367, + UNSIGNED_INT_2_10_10_10_REV = 0x8368, + BGR = 0x80E0, + BGRA = 0x80E1, + MAX_ELEMENTS_VERTICES = 0x80E8, + MAX_ELEMENTS_INDICES = 0x80E9, + CLAMP_TO_EDGE = 0x812F, + TEXTURE_MIN_LOD = 0x813A, + TEXTURE_MAX_LOD = 0x813B, + TEXTURE_BASE_LEVEL = 0x813C, + TEXTURE_MAX_LEVEL = 0x813D, + SMOOTH_POINT_SIZE_RANGE = 0x0B12, + SMOOTH_POINT_SIZE_GRANULARITY = 0x0B13, + SMOOTH_LINE_WIDTH_RANGE = 0x0B22, + SMOOTH_LINE_WIDTH_GRANULARITY = 0x0B23, + ALIASED_LINE_WIDTH_RANGE = 0x846E, + + // Version: 1.3 + TEXTURE0 = 0x84C0, + TEXTURE1 = 0x84C1, + TEXTURE2 = 0x84C2, + TEXTURE3 = 0x84C3, + TEXTURE4 = 0x84C4, + TEXTURE5 = 0x84C5, + TEXTURE6 = 0x84C6, + TEXTURE7 = 0x84C7, + TEXTURE8 = 0x84C8, + TEXTURE9 = 0x84C9, + TEXTURE10 = 0x84CA, + TEXTURE11 = 0x84CB, + TEXTURE12 = 0x84CC, + TEXTURE13 = 0x84CD, + TEXTURE14 = 0x84CE, + TEXTURE15 = 0x84CF, + TEXTURE16 = 0x84D0, + TEXTURE17 = 0x84D1, + TEXTURE18 = 0x84D2, + TEXTURE19 = 0x84D3, + TEXTURE20 = 0x84D4, + TEXTURE21 = 0x84D5, + TEXTURE22 = 0x84D6, + TEXTURE23 = 0x84D7, + TEXTURE24 = 0x84D8, + TEXTURE25 = 0x84D9, + TEXTURE26 = 0x84DA, + TEXTURE27 = 0x84DB, + TEXTURE28 = 0x84DC, + TEXTURE29 = 0x84DD, + TEXTURE30 = 0x84DE, + TEXTURE31 = 0x84DF, + ACTIVE_TEXTURE = 0x84E0, + MULTISAMPLE = 0x809D, + SAMPLE_ALPHA_TO_COVERAGE = 0x809E, + SAMPLE_ALPHA_TO_ONE = 0x809F, + SAMPLE_COVERAGE = 0x80A0, + SAMPLE_BUFFERS = 0x80A8, + SAMPLES = 0x80A9, + SAMPLE_COVERAGE_VALUE = 0x80AA, + SAMPLE_COVERAGE_INVERT = 0x80AB, + TEXTURE_CUBE_MAP = 0x8513, + TEXTURE_BINDING_CUBE_MAP = 0x8514, + TEXTURE_CUBE_MAP_POSITIVE_X = 0x8515, + TEXTURE_CUBE_MAP_NEGATIVE_X = 0x8516, + TEXTURE_CUBE_MAP_POSITIVE_Y = 0x8517, + TEXTURE_CUBE_MAP_NEGATIVE_Y = 0x8518, + TEXTURE_CUBE_MAP_POSITIVE_Z = 0x8519, + TEXTURE_CUBE_MAP_NEGATIVE_Z = 0x851A, + PROXY_TEXTURE_CUBE_MAP = 0x851B, + MAX_CUBE_MAP_TEXTURE_SIZE = 0x851C, + COMPRESSED_RGB = 0x84ED, + COMPRESSED_RGBA = 0x84EE, + TEXTURE_COMPRESSION_HINT = 0x84EF, + TEXTURE_COMPRESSED_IMAGE_SIZE = 0x86A0, + TEXTURE_COMPRESSED = 0x86A1, + NUM_COMPRESSED_TEXTURE_FORMATS = 0x86A2, + COMPRESSED_TEXTURE_FORMATS = 0x86A3, + CLAMP_TO_BORDER = 0x812D, + + // Version: 1.4 + BLEND_DST_RGB = 0x80C8, + BLEND_SRC_RGB = 0x80C9, + BLEND_DST_ALPHA = 0x80CA, + 
BLEND_SRC_ALPHA = 0x80CB, + POINT_FADE_THRESHOLD_SIZE = 0x8128, + DEPTH_COMPONENT16 = 0x81A5, + DEPTH_COMPONENT24 = 0x81A6, + DEPTH_COMPONENT32 = 0x81A7, + MIRRORED_REPEAT = 0x8370, + MAX_TEXTURE_LOD_BIAS = 0x84FD, + TEXTURE_LOD_BIAS = 0x8501, + INCR_WRAP = 0x8507, + DECR_WRAP = 0x8508, + TEXTURE_DEPTH_SIZE = 0x884A, + TEXTURE_COMPARE_MODE = 0x884C, + TEXTURE_COMPARE_FUNC = 0x884D, + + // Version: 1.5 + BUFFER_SIZE = 0x8764, + BUFFER_USAGE = 0x8765, + QUERY_COUNTER_BITS = 0x8864, + CURRENT_QUERY = 0x8865, + QUERY_RESULT = 0x8866, + QUERY_RESULT_AVAILABLE = 0x8867, + ARRAY_BUFFER = 0x8892, + ELEMENT_ARRAY_BUFFER = 0x8893, + ARRAY_BUFFER_BINDING = 0x8894, + ELEMENT_ARRAY_BUFFER_BINDING = 0x8895, + VERTEX_ATTRIB_ARRAY_BUFFER_BINDING = 0x889F, + READ_ONLY = 0x88B8, + WRITE_ONLY = 0x88B9, + READ_WRITE = 0x88BA, + BUFFER_ACCESS = 0x88BB, + BUFFER_MAPPED = 0x88BC, + BUFFER_MAP_POINTER = 0x88BD, + STREAM_DRAW = 0x88E0, + STREAM_READ = 0x88E1, + STREAM_COPY = 0x88E2, + STATIC_DRAW = 0x88E4, + STATIC_READ = 0x88E5, + STATIC_COPY = 0x88E6, + DYNAMIC_DRAW = 0x88E8, + DYNAMIC_READ = 0x88E9, + DYNAMIC_COPY = 0x88EA, + SAMPLES_PASSED = 0x8914, + SRC1_ALPHA = 0x8589, + + // Version: 2.0 + BLEND_EQUATION_RGB = 0x8009, + VERTEX_ATTRIB_ARRAY_ENABLED = 0x8622, + VERTEX_ATTRIB_ARRAY_SIZE = 0x8623, + VERTEX_ATTRIB_ARRAY_STRIDE = 0x8624, + VERTEX_ATTRIB_ARRAY_TYPE = 0x8625, + CURRENT_VERTEX_ATTRIB = 0x8626, + VERTEX_PROGRAM_POINT_SIZE = 0x8642, + VERTEX_ATTRIB_ARRAY_POINTER = 0x8645, + STENCIL_BACK_FUNC = 0x8800, + STENCIL_BACK_FAIL = 0x8801, + STENCIL_BACK_PASS_DEPTH_FAIL = 0x8802, + STENCIL_BACK_PASS_DEPTH_PASS = 0x8803, + MAX_DRAW_BUFFERS = 0x8824, + DRAW_BUFFER0 = 0x8825, + DRAW_BUFFER1 = 0x8826, + DRAW_BUFFER2 = 0x8827, + DRAW_BUFFER3 = 0x8828, + DRAW_BUFFER4 = 0x8829, + DRAW_BUFFER5 = 0x882A, + DRAW_BUFFER6 = 0x882B, + DRAW_BUFFER7 = 0x882C, + DRAW_BUFFER8 = 0x882D, + DRAW_BUFFER9 = 0x882E, + DRAW_BUFFER10 = 0x882F, + DRAW_BUFFER11 = 0x8830, + DRAW_BUFFER12 = 0x8831, + DRAW_BUFFER13 = 0x8832, + DRAW_BUFFER14 = 0x8833, + DRAW_BUFFER15 = 0x8834, + BLEND_EQUATION_ALPHA = 0x883D, + MAX_VERTEX_ATTRIBS = 0x8869, + VERTEX_ATTRIB_ARRAY_NORMALIZED = 0x886A, + MAX_TEXTURE_IMAGE_UNITS = 0x8872, + FRAGMENT_SHADER = 0x8B30, + VERTEX_SHADER = 0x8B31, + MAX_FRAGMENT_UNIFORM_COMPONENTS = 0x8B49, + MAX_VERTEX_UNIFORM_COMPONENTS = 0x8B4A, + MAX_VARYING_FLOATS = 0x8B4B, + MAX_VERTEX_TEXTURE_IMAGE_UNITS = 0x8B4C, + MAX_COMBINED_TEXTURE_IMAGE_UNITS = 0x8B4D, + SHADER_TYPE = 0x8B4F, + FLOAT_VEC2 = 0x8B50, + FLOAT_VEC3 = 0x8B51, + FLOAT_VEC4 = 0x8B52, + INT_VEC2 = 0x8B53, + INT_VEC3 = 0x8B54, + INT_VEC4 = 0x8B55, + BOOL = 0x8B56, + BOOL_VEC2 = 0x8B57, + BOOL_VEC3 = 0x8B58, + BOOL_VEC4 = 0x8B59, + FLOAT_MAT2 = 0x8B5A, + FLOAT_MAT3 = 0x8B5B, + FLOAT_MAT4 = 0x8B5C, + SAMPLER_1D = 0x8B5D, + SAMPLER_2D = 0x8B5E, + SAMPLER_3D = 0x8B5F, + SAMPLER_CUBE = 0x8B60, + SAMPLER_1D_SHADOW = 0x8B61, + SAMPLER_2D_SHADOW = 0x8B62, + DELETE_STATUS = 0x8B80, + COMPILE_STATUS = 0x8B81, + LINK_STATUS = 0x8B82, + VALIDATE_STATUS = 0x8B83, + INFO_LOG_LENGTH = 0x8B84, + ATTACHED_SHADERS = 0x8B85, + ACTIVE_UNIFORMS = 0x8B86, + ACTIVE_UNIFORM_MAX_LENGTH = 0x8B87, + SHADER_SOURCE_LENGTH = 0x8B88, + ACTIVE_ATTRIBUTES = 0x8B89, + ACTIVE_ATTRIBUTE_MAX_LENGTH = 0x8B8A, + FRAGMENT_SHADER_DERIVATIVE_HINT = 0x8B8B, + SHADING_LANGUAGE_VERSION = 0x8B8C, + CURRENT_PROGRAM = 0x8B8D, + POINT_SPRITE_COORD_ORIGIN = 0x8CA0, + LOWER_LEFT = 0x8CA1, + UPPER_LEFT = 0x8CA2, + STENCIL_BACK_REF = 0x8CA3, + STENCIL_BACK_VALUE_MASK = 0x8CA4, + STENCIL_BACK_WRITEMASK = 0x8CA5, + + 
// Version: 2.1 + PIXEL_PACK_BUFFER = 0x88EB, + PIXEL_UNPACK_BUFFER = 0x88EC, + PIXEL_PACK_BUFFER_BINDING = 0x88ED, + PIXEL_UNPACK_BUFFER_BINDING = 0x88EF, + FLOAT_MAT2x3 = 0x8B65, + FLOAT_MAT2x4 = 0x8B66, + FLOAT_MAT3x2 = 0x8B67, + FLOAT_MAT3x4 = 0x8B68, + FLOAT_MAT4x2 = 0x8B69, + FLOAT_MAT4x3 = 0x8B6A, + SRGB = 0x8C40, + SRGB8 = 0x8C41, + SRGB_ALPHA = 0x8C42, + SRGB8_ALPHA8 = 0x8C43, + COMPRESSED_SRGB = 0x8C48, + COMPRESSED_SRGB_ALPHA = 0x8C49, + + // Core Extension: ARB_vertex_array_object + VERTEX_ARRAY_BINDING = 0x85B5, + + // Core Extension: ARB_texture_rg + RG = 0x8227, + RG_INTEGER = 0x8228, + R8 = 0x8229, + R16 = 0x822A, + RG8 = 0x822B, + RG16 = 0x822C, + R16F = 0x822D, + R32F = 0x822E, + RG16F = 0x822F, + RG32F = 0x8230, + R8I = 0x8231, + R8UI = 0x8232, + R16I = 0x8233, + R16UI = 0x8234, + R32I = 0x8235, + R32UI = 0x8236, + RG8I = 0x8237, + RG8UI = 0x8238, + RG16I = 0x8239, + RG16UI = 0x823A, + RG32I = 0x823B, + RG32UI = 0x823C, + + // Core Extension: ARB_texture_compression_rgtc + COMPRESSED_RED_RGTC1 = 0x8DBB, + COMPRESSED_SIGNED_RED_RGTC1 = 0x8DBC, + COMPRESSED_RG_RGTC2 = 0x8DBD, + COMPRESSED_SIGNED_RG_RGTC2 = 0x8DBE, + + // Core Extension: ARB_map_buffer_range + MAP_READ_BIT = 0x0001, + MAP_WRITE_BIT = 0x0002, + MAP_INVALIDATE_RANGE_BIT = 0x0004, + MAP_INVALIDATE_BUFFER_BIT = 0x0008, + MAP_FLUSH_EXPLICIT_BIT = 0x0010, + MAP_UNSYNCHRONIZED_BIT = 0x0020, + + // Core Extension: ARB_half_float_vertex + HALF_FLOAT = 0x140B, + + // Core Extension: ARB_framebuffer_sRGB + FRAMEBUFFER_SRGB = 0x8DB9, + + // Core Extension: ARB_framebuffer_object + INVALID_FRAMEBUFFER_OPERATION = 0x0506, + FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING = 0x8210, + FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE = 0x8211, + FRAMEBUFFER_ATTACHMENT_RED_SIZE = 0x8212, + FRAMEBUFFER_ATTACHMENT_GREEN_SIZE = 0x8213, + FRAMEBUFFER_ATTACHMENT_BLUE_SIZE = 0x8214, + FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE = 0x8215, + FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE = 0x8216, + FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE = 0x8217, + FRAMEBUFFER_DEFAULT = 0x8218, + FRAMEBUFFER_UNDEFINED = 0x8219, + DEPTH_STENCIL_ATTACHMENT = 0x821A, + INDEX = 0x8222, + MAX_RENDERBUFFER_SIZE = 0x84E8, + DEPTH_STENCIL = 0x84F9, + UNSIGNED_INT_24_8 = 0x84FA, + DEPTH24_STENCIL8 = 0x88F0, + TEXTURE_STENCIL_SIZE = 0x88F1, + TEXTURE_RED_TYPE = 0x8C10, + TEXTURE_GREEN_TYPE = 0x8C11, + TEXTURE_BLUE_TYPE = 0x8C12, + TEXTURE_ALPHA_TYPE = 0x8C13, + TEXTURE_DEPTH_TYPE = 0x8C16, + UNSIGNED_NORMALIZED = 0x8C17, + FRAMEBUFFER_BINDING = 0x8CA6, + DRAW_FRAMEBUFFER_BINDING = 0x8CA6, + RENDERBUFFER_BINDING = 0x8CA7, + READ_FRAMEBUFFER = 0x8CA8, + DRAW_FRAMEBUFFER = 0x8CA9, + READ_FRAMEBUFFER_BINDING = 0x8CAA, + RENDERBUFFER_SAMPLES = 0x8CAB, + FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE = 0x8CD0, + FRAMEBUFFER_ATTACHMENT_OBJECT_NAME = 0x8CD1, + FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL = 0x8CD2, + FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE = 0x8CD3, + FRAMEBUFFER_ATTACHMENT_TEXTURE_LAYER = 0x8CD4, + FRAMEBUFFER_COMPLETE = 0x8CD5, + FRAMEBUFFER_INCOMPLETE_ATTACHMENT = 0x8CD6, + FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT = 0x8CD7, + FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER = 0x8CDB, + FRAMEBUFFER_INCOMPLETE_READ_BUFFER = 0x8CDC, + FRAMEBUFFER_UNSUPPORTED = 0x8CDD, + MAX_COLOR_ATTACHMENTS = 0x8CDF, + COLOR_ATTACHMENT0 = 0x8CE0, + COLOR_ATTACHMENT1 = 0x8CE1, + COLOR_ATTACHMENT2 = 0x8CE2, + COLOR_ATTACHMENT3 = 0x8CE3, + COLOR_ATTACHMENT4 = 0x8CE4, + COLOR_ATTACHMENT5 = 0x8CE5, + COLOR_ATTACHMENT6 = 0x8CE6, + COLOR_ATTACHMENT7 = 0x8CE7, + COLOR_ATTACHMENT8 = 0x8CE8, + COLOR_ATTACHMENT9 = 0x8CE9, + COLOR_ATTACHMENT10 = 
0x8CEA, + COLOR_ATTACHMENT11 = 0x8CEB, + COLOR_ATTACHMENT12 = 0x8CEC, + COLOR_ATTACHMENT13 = 0x8CED, + COLOR_ATTACHMENT14 = 0x8CEE, + COLOR_ATTACHMENT15 = 0x8CEF, + DEPTH_ATTACHMENT = 0x8D00, + STENCIL_ATTACHMENT = 0x8D20, + FRAMEBUFFER = 0x8D40, + RENDERBUFFER = 0x8D41, + RENDERBUFFER_WIDTH = 0x8D42, + RENDERBUFFER_HEIGHT = 0x8D43, + RENDERBUFFER_INTERNAL_FORMAT = 0x8D44, + STENCIL_INDEX1 = 0x8D46, + STENCIL_INDEX4 = 0x8D47, + STENCIL_INDEX8 = 0x8D48, + STENCIL_INDEX16 = 0x8D49, + RENDERBUFFER_RED_SIZE = 0x8D50, + RENDERBUFFER_GREEN_SIZE = 0x8D51, + RENDERBUFFER_BLUE_SIZE = 0x8D52, + RENDERBUFFER_ALPHA_SIZE = 0x8D53, + RENDERBUFFER_DEPTH_SIZE = 0x8D54, + RENDERBUFFER_STENCIL_SIZE = 0x8D55, + FRAMEBUFFER_INCOMPLETE_MULTISAMPLE = 0x8D56, + MAX_SAMPLES = 0x8D57, + TEXTURE_LUMINANCE_TYPE = 0x8C14, + TEXTURE_INTENSITY_TYPE = 0x8C15, + + // Core Extension: ARB_depth_buffer_float + DEPTH_COMPONENT32F = 0x8CAC, + DEPTH32F_STENCIL8 = 0x8CAD, + FLOAT_32_UNSIGNED_INT_24_8_REV = 0x8DAD, + + // Version: 3.0 + COMPARE_REF_TO_TEXTURE = 0x884E, + CLIP_DISTANCE0 = 0x3000, + CLIP_DISTANCE1 = 0x3001, + CLIP_DISTANCE2 = 0x3002, + CLIP_DISTANCE3 = 0x3003, + CLIP_DISTANCE4 = 0x3004, + CLIP_DISTANCE5 = 0x3005, + CLIP_DISTANCE6 = 0x3006, + CLIP_DISTANCE7 = 0x3007, + MAX_CLIP_DISTANCES = 0x0D32, + MAJOR_VERSION = 0x821B, + MINOR_VERSION = 0x821C, + NUM_EXTENSIONS = 0x821D, + CONTEXT_FLAGS = 0x821E, + COMPRESSED_RED = 0x8225, + COMPRESSED_RG = 0x8226, + CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT = 0x0001, + RGBA32F = 0x8814, + RGB32F = 0x8815, + RGBA16F = 0x881A, + RGB16F = 0x881B, + VERTEX_ATTRIB_ARRAY_INTEGER = 0x88FD, + MAX_ARRAY_TEXTURE_LAYERS = 0x88FF, + MIN_PROGRAM_TEXEL_OFFSET = 0x8904, + MAX_PROGRAM_TEXEL_OFFSET = 0x8905, + CLAMP_READ_COLOR = 0x891C, + FIXED_ONLY = 0x891D, + TEXTURE_1D_ARRAY = 0x8C18, + PROXY_TEXTURE_1D_ARRAY = 0x8C19, + TEXTURE_2D_ARRAY = 0x8C1A, + PROXY_TEXTURE_2D_ARRAY = 0x8C1B, + TEXTURE_BINDING_1D_ARRAY = 0x8C1C, + TEXTURE_BINDING_2D_ARRAY = 0x8C1D, + R11F_G11F_B10F = 0x8C3A, + UNSIGNED_INT_10F_11F_11F_REV = 0x8C3B, + RGB9_E5 = 0x8C3D, + UNSIGNED_INT_5_9_9_9_REV = 0x8C3E, + TEXTURE_SHARED_SIZE = 0x8C3F, + TRANSFORM_FEEDBACK_VARYING_MAX_LENGTH = 0x8C76, + TRANSFORM_FEEDBACK_BUFFER_MODE = 0x8C7F, + MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 0x8C80, + TRANSFORM_FEEDBACK_VARYINGS = 0x8C83, + TRANSFORM_FEEDBACK_BUFFER_START = 0x8C84, + TRANSFORM_FEEDBACK_BUFFER_SIZE = 0x8C85, + PRIMITIVES_GENERATED = 0x8C87, + TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN = 0x8C88, + RASTERIZER_DISCARD = 0x8C89, + MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 0x8C8A, + MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS = 0x8C8B, + INTERLEAVED_ATTRIBS = 0x8C8C, + SEPARATE_ATTRIBS = 0x8C8D, + TRANSFORM_FEEDBACK_BUFFER = 0x8C8E, + TRANSFORM_FEEDBACK_BUFFER_BINDING = 0x8C8F, + RGBA32UI = 0x8D70, + RGB32UI = 0x8D71, + RGBA16UI = 0x8D76, + RGB16UI = 0x8D77, + RGBA8UI = 0x8D7C, + RGB8UI = 0x8D7D, + RGBA32I = 0x8D82, + RGB32I = 0x8D83, + RGBA16I = 0x8D88, + RGB16I = 0x8D89, + RGBA8I = 0x8D8E, + RGB8I = 0x8D8F, + RED_INTEGER = 0x8D94, + GREEN_INTEGER = 0x8D95, + BLUE_INTEGER = 0x8D96, + RGB_INTEGER = 0x8D98, + RGBA_INTEGER = 0x8D99, + BGR_INTEGER = 0x8D9A, + BGRA_INTEGER = 0x8D9B, + SAMPLER_1D_ARRAY = 0x8DC0, + SAMPLER_2D_ARRAY = 0x8DC1, + SAMPLER_1D_ARRAY_SHADOW = 0x8DC3, + SAMPLER_2D_ARRAY_SHADOW = 0x8DC4, + SAMPLER_CUBE_SHADOW = 0x8DC5, + UNSIGNED_INT_VEC2 = 0x8DC6, + UNSIGNED_INT_VEC3 = 0x8DC7, + UNSIGNED_INT_VEC4 = 0x8DC8, + INT_SAMPLER_1D = 0x8DC9, + INT_SAMPLER_2D = 0x8DCA, + INT_SAMPLER_3D = 0x8DCB, + INT_SAMPLER_CUBE = 0x8DCC, 
+ INT_SAMPLER_1D_ARRAY = 0x8DCE, + INT_SAMPLER_2D_ARRAY = 0x8DCF, + UNSIGNED_INT_SAMPLER_1D = 0x8DD1, + UNSIGNED_INT_SAMPLER_2D = 0x8DD2, + UNSIGNED_INT_SAMPLER_3D = 0x8DD3, + UNSIGNED_INT_SAMPLER_CUBE = 0x8DD4, + UNSIGNED_INT_SAMPLER_1D_ARRAY = 0x8DD6, + UNSIGNED_INT_SAMPLER_2D_ARRAY = 0x8DD7, + QUERY_WAIT = 0x8E13, + QUERY_NO_WAIT = 0x8E14, + QUERY_BY_REGION_WAIT = 0x8E15, + QUERY_BY_REGION_NO_WAIT = 0x8E16, + BUFFER_ACCESS_FLAGS = 0x911F, + BUFFER_MAP_LENGTH = 0x9120, + BUFFER_MAP_OFFSET = 0x9121, + + // Core Extension: ARB_uniform_buffer_object + UNIFORM_BUFFER = 0x8A11, + UNIFORM_BUFFER_BINDING = 0x8A28, + UNIFORM_BUFFER_START = 0x8A29, + UNIFORM_BUFFER_SIZE = 0x8A2A, + MAX_VERTEX_UNIFORM_BLOCKS = 0x8A2B, + MAX_FRAGMENT_UNIFORM_BLOCKS = 0x8A2D, + MAX_COMBINED_UNIFORM_BLOCKS = 0x8A2E, + MAX_UNIFORM_BUFFER_BINDINGS = 0x8A2F, + MAX_UNIFORM_BLOCK_SIZE = 0x8A30, + MAX_COMBINED_VERTEX_UNIFORM_COMPONENTS = 0x8A31, + MAX_COMBINED_FRAGMENT_UNIFORM_COMPONENTS = 0x8A33, + UNIFORM_BUFFER_OFFSET_ALIGNMENT = 0x8A34, + ACTIVE_UNIFORM_BLOCK_MAX_NAME_LENGTH = 0x8A35, + ACTIVE_UNIFORM_BLOCKS = 0x8A36, + UNIFORM_TYPE = 0x8A37, + UNIFORM_SIZE = 0x8A38, + UNIFORM_NAME_LENGTH = 0x8A39, + UNIFORM_BLOCK_INDEX = 0x8A3A, + UNIFORM_OFFSET = 0x8A3B, + UNIFORM_ARRAY_STRIDE = 0x8A3C, + UNIFORM_MATRIX_STRIDE = 0x8A3D, + UNIFORM_IS_ROW_MAJOR = 0x8A3E, + UNIFORM_BLOCK_BINDING = 0x8A3F, + UNIFORM_BLOCK_DATA_SIZE = 0x8A40, + UNIFORM_BLOCK_NAME_LENGTH = 0x8A41, + UNIFORM_BLOCK_ACTIVE_UNIFORMS = 0x8A42, + UNIFORM_BLOCK_ACTIVE_UNIFORM_INDICES = 0x8A43, + UNIFORM_BLOCK_REFERENCED_BY_VERTEX_SHADER = 0x8A44, + UNIFORM_BLOCK_REFERENCED_BY_FRAGMENT_SHADER = 0x8A46, + INVALID_INDEX = 0xFFFFFFFF, + MAX_GEOMETRY_UNIFORM_BLOCKS = 0x8A2C, + MAX_COMBINED_GEOMETRY_UNIFORM_COMPONENTS = 0x8A32, + UNIFORM_BLOCK_REFERENCED_BY_GEOMETRY_SHADER = 0x8A45, + + // Core Extension: ARB_copy_buffer + COPY_READ_BUFFER = 0x8F36, + COPY_WRITE_BUFFER = 0x8F37, + COPY_READ_BUFFER_BINDING = 0x8F36, + COPY_WRITE_BUFFER_BINDING = 0x8F37, + + // Version: 3.1 + SAMPLER_2D_RECT = 0x8B63, + SAMPLER_2D_RECT_SHADOW = 0x8B64, + SAMPLER_BUFFER = 0x8DC2, + INT_SAMPLER_2D_RECT = 0x8DCD, + INT_SAMPLER_BUFFER = 0x8DD0, + UNSIGNED_INT_SAMPLER_2D_RECT = 0x8DD5, + UNSIGNED_INT_SAMPLER_BUFFER = 0x8DD8, + TEXTURE_BUFFER = 0x8C2A, + MAX_TEXTURE_BUFFER_SIZE = 0x8C2B, + TEXTURE_BINDING_BUFFER = 0x8C2C, + TEXTURE_BUFFER_DATA_STORE_BINDING = 0x8C2D, + TEXTURE_RECTANGLE = 0x84F5, + TEXTURE_BINDING_RECTANGLE = 0x84F6, + PROXY_TEXTURE_RECTANGLE = 0x84F7, + MAX_RECTANGLE_TEXTURE_SIZE = 0x84F8, + RED_SNORM = 0x8F90, + RG_SNORM = 0x8F91, + RGB_SNORM = 0x8F92, + RGBA_SNORM = 0x8F93, + R8_SNORM = 0x8F94, + RG8_SNORM = 0x8F95, + RGB8_SNORM = 0x8F96, + RGBA8_SNORM = 0x8F97, + R16_SNORM = 0x8F98, + RG16_SNORM = 0x8F99, + RGB16_SNORM = 0x8F9A, + RGBA16_SNORM = 0x8F9B, + SIGNED_NORMALIZED = 0x8F9C, + PRIMITIVE_RESTART = 0x8F9D, + PRIMITIVE_RESTART_INDEX = 0x8F9E, + + // Legacy + VERTEX_ARRAY = 0x8074, + NORMAL_ARRAY = 0x8075, + COLOR_ARRAY = 0x8076, + TEXTURE_COORD_ARRAY = 0x8078, + TEXTURE_ENV = 0x2300, + TEXTURE_ENV_MODE = 0x2200, + MODELVIEW = 0x1700, + PROJECTION = 0x1701, + LIGHTING = 0x0B50 + }; + + // Extension: 1.1 + extern void (CODEGEN_FUNCPTR *CullFace)(GLenum mode); + extern void (CODEGEN_FUNCPTR *FrontFace)(GLenum mode); + extern void (CODEGEN_FUNCPTR *Hint)(GLenum target, GLenum mode); + extern void (CODEGEN_FUNCPTR *LineWidth)(GLfloat width); + extern void (CODEGEN_FUNCPTR *PointSize)(GLfloat size); + extern void (CODEGEN_FUNCPTR *PolygonMode)(GLenum face, GLenum mode); + 
extern void (CODEGEN_FUNCPTR *Scissor)(GLint x, GLint y, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *TexParameterf)(GLenum target, GLenum pname, GLfloat param); + extern void (CODEGEN_FUNCPTR *TexParameterfv)(GLenum target, GLenum pname, const GLfloat *params); + extern void (CODEGEN_FUNCPTR *TexParameteri)(GLenum target, GLenum pname, GLint param); + extern void (CODEGEN_FUNCPTR *TexParameteriv)(GLenum target, GLenum pname, const GLint *params); + extern void (CODEGEN_FUNCPTR *TexImage1D)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *TexImage2D)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *DrawBuffer)(GLenum mode); + extern void (CODEGEN_FUNCPTR *Clear)(GLbitfield mask); + extern void (CODEGEN_FUNCPTR *ClearColor)(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha); + extern void (CODEGEN_FUNCPTR *ClearStencil)(GLint s); + extern void (CODEGEN_FUNCPTR *ClearDepth)(GLdouble depth); + extern void (CODEGEN_FUNCPTR *StencilMask)(GLuint mask); + extern void (CODEGEN_FUNCPTR *ColorMask)(GLboolean red, GLboolean green, GLboolean blue, GLboolean alpha); + extern void (CODEGEN_FUNCPTR *DepthMask)(GLboolean flag); + extern void (CODEGEN_FUNCPTR *Disable)(GLenum cap); + extern void (CODEGEN_FUNCPTR *Enable)(GLenum cap); + extern void (CODEGEN_FUNCPTR *Finish)(); + extern void (CODEGEN_FUNCPTR *Flush)(); + extern void (CODEGEN_FUNCPTR *BlendFunc)(GLenum sfactor, GLenum dfactor); + extern void (CODEGEN_FUNCPTR *LogicOp)(GLenum opcode); + extern void (CODEGEN_FUNCPTR *StencilFunc)(GLenum func, GLint ref, GLuint mask); + extern void (CODEGEN_FUNCPTR *StencilOp)(GLenum fail, GLenum zfail, GLenum zpass); + extern void (CODEGEN_FUNCPTR *DepthFunc)(GLenum func); + extern void (CODEGEN_FUNCPTR *PixelStoref)(GLenum pname, GLfloat param); + extern void (CODEGEN_FUNCPTR *PixelStorei)(GLenum pname, GLint param); + extern void (CODEGEN_FUNCPTR *ReadBuffer)(GLenum mode); + extern void (CODEGEN_FUNCPTR *ReadPixels)(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *GetBooleanv)(GLenum pname, GLboolean *params); + extern void (CODEGEN_FUNCPTR *GetDoublev)(GLenum pname, GLdouble *params); + extern GLenum (CODEGEN_FUNCPTR *GetError)(); + extern void (CODEGEN_FUNCPTR *GetFloatv)(GLenum pname, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetIntegerv)(GLenum pname, GLint *params); + extern const GLubyte * (CODEGEN_FUNCPTR *GetString)(GLenum name); + extern void (CODEGEN_FUNCPTR *GetTexImage)(GLenum target, GLint level, GLenum format, GLenum type, GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *GetTexParameterfv)(GLenum target, GLenum pname, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetTexParameteriv)(GLenum target, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetTexLevelParameterfv)(GLenum target, GLint level, GLenum pname, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetTexLevelParameteriv)(GLenum target, GLint level, GLenum pname, GLint *params); + extern GLboolean (CODEGEN_FUNCPTR *IsEnabled)(GLenum cap); + extern void (CODEGEN_FUNCPTR *DepthRange)(GLdouble ren_near, GLdouble ren_far); + extern void (CODEGEN_FUNCPTR *Viewport)(GLint x, GLint y, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *DrawArrays)(GLenum mode, GLint 
first, GLsizei count); + extern void (CODEGEN_FUNCPTR *DrawElements)(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices); + extern void (CODEGEN_FUNCPTR *GetPointerv)(GLenum pname, GLvoid* *params); + extern void (CODEGEN_FUNCPTR *PolygonOffset)(GLfloat factor, GLfloat units); + extern void (CODEGEN_FUNCPTR *CopyTexImage1D)(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLint border); + extern void (CODEGEN_FUNCPTR *CopyTexImage2D)(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border); + extern void (CODEGEN_FUNCPTR *CopyTexSubImage1D)(GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width); + extern void (CODEGEN_FUNCPTR *CopyTexSubImage2D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *TexSubImage1D)(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *TexSubImage2D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *BindTexture)(GLenum target, GLuint texture); + extern void (CODEGEN_FUNCPTR *DeleteTextures)(GLsizei n, const GLuint *textures); + extern void (CODEGEN_FUNCPTR *GenTextures)(GLsizei n, GLuint *textures); + extern GLboolean (CODEGEN_FUNCPTR *IsTexture)(GLuint texture); + extern void (CODEGEN_FUNCPTR *Indexub)(GLubyte c); + extern void (CODEGEN_FUNCPTR *Indexubv)(const GLubyte *c); + + // Extension: 1.2 + extern void (CODEGEN_FUNCPTR *BlendColor)(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha); + extern void (CODEGEN_FUNCPTR *BlendEquation)(GLenum mode); + extern void (CODEGEN_FUNCPTR *DrawRangeElements)(GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid *indices); + extern void (CODEGEN_FUNCPTR *TexSubImage3D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *CopyTexSubImage3D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height); + + // Extension: 1.3 + extern void (CODEGEN_FUNCPTR *ActiveTexture)(GLenum texture); + extern void (CODEGEN_FUNCPTR *SampleCoverage)(GLfloat value, GLboolean invert); + extern void (CODEGEN_FUNCPTR *CompressedTexImage3D)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *CompressedTexImage2D)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *CompressedTexImage1D)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *CompressedTexSubImage3D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *CompressedTexSubImage2D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const GLvoid *data); + extern void 
(CODEGEN_FUNCPTR *CompressedTexSubImage1D)(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *GetCompressedTexImage)(GLenum target, GLint level, GLvoid *img); + + // Extension: 1.4 + extern void (CODEGEN_FUNCPTR *BlendFuncSeparate)(GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha); + extern void (CODEGEN_FUNCPTR *MultiDrawArrays)(GLenum mode, const GLint *first, const GLsizei *count, GLsizei drawcount); + extern void (CODEGEN_FUNCPTR *MultiDrawElements)(GLenum mode, const GLsizei *count, GLenum type, const GLvoid* const *indices, GLsizei drawcount); + extern void (CODEGEN_FUNCPTR *PointParameterf)(GLenum pname, GLfloat param); + extern void (CODEGEN_FUNCPTR *PointParameterfv)(GLenum pname, const GLfloat *params); + extern void (CODEGEN_FUNCPTR *PointParameteri)(GLenum pname, GLint param); + extern void (CODEGEN_FUNCPTR *PointParameteriv)(GLenum pname, const GLint *params); + + // Extension: 1.5 + extern void (CODEGEN_FUNCPTR *GenQueries)(GLsizei n, GLuint *ids); + extern void (CODEGEN_FUNCPTR *DeleteQueries)(GLsizei n, const GLuint *ids); + extern GLboolean (CODEGEN_FUNCPTR *IsQuery)(GLuint id); + extern void (CODEGEN_FUNCPTR *BeginQuery)(GLenum target, GLuint id); + extern void (CODEGEN_FUNCPTR *EndQuery)(GLenum target); + extern void (CODEGEN_FUNCPTR *GetQueryiv)(GLenum target, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetQueryObjectiv)(GLuint id, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetQueryObjectuiv)(GLuint id, GLenum pname, GLuint *params); + extern void (CODEGEN_FUNCPTR *BindBuffer)(GLenum target, GLuint buffer); + extern void (CODEGEN_FUNCPTR *DeleteBuffers)(GLsizei n, const GLuint *buffers); + extern void (CODEGEN_FUNCPTR *GenBuffers)(GLsizei n, GLuint *buffers); + extern GLboolean (CODEGEN_FUNCPTR *IsBuffer)(GLuint buffer); + extern void (CODEGEN_FUNCPTR *BufferData)(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage); + extern void (CODEGEN_FUNCPTR *BufferSubData)(GLenum target, GLintptr offset, GLsizeiptr size, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *GetBufferSubData)(GLenum target, GLintptr offset, GLsizeiptr size, GLvoid *data); + extern GLvoid* (CODEGEN_FUNCPTR *MapBuffer)(GLenum target, GLenum access); + extern GLboolean (CODEGEN_FUNCPTR *UnmapBuffer)(GLenum target); + extern void (CODEGEN_FUNCPTR *GetBufferParameteriv)(GLenum target, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetBufferPointerv)(GLenum target, GLenum pname, GLvoid* *params); + + // Extension: 2.0 + extern void (CODEGEN_FUNCPTR *BlendEquationSeparate)(GLenum modeRGB, GLenum modeAlpha); + extern void (CODEGEN_FUNCPTR *DrawBuffers)(GLsizei n, const GLenum *bufs); + extern void (CODEGEN_FUNCPTR *StencilOpSeparate)(GLenum face, GLenum sfail, GLenum dpfail, GLenum dppass); + extern void (CODEGEN_FUNCPTR *StencilFuncSeparate)(GLenum face, GLenum func, GLint ref, GLuint mask); + extern void (CODEGEN_FUNCPTR *StencilMaskSeparate)(GLenum face, GLuint mask); + extern void (CODEGEN_FUNCPTR *AttachShader)(GLuint program, GLuint shader); + extern void (CODEGEN_FUNCPTR *BindAttribLocation)(GLuint program, GLuint index, const GLchar *name); + extern void (CODEGEN_FUNCPTR *CompileShader)(GLuint shader); + extern GLuint (CODEGEN_FUNCPTR *CreateProgram)(); + extern GLuint (CODEGEN_FUNCPTR *CreateShader)(GLenum type); + extern void (CODEGEN_FUNCPTR *DeleteProgram)(GLuint program); + extern void (CODEGEN_FUNCPTR 
*DeleteShader)(GLuint shader); + extern void (CODEGEN_FUNCPTR *DetachShader)(GLuint program, GLuint shader); + extern void (CODEGEN_FUNCPTR *DisableVertexAttribArray)(GLuint index); + extern void (CODEGEN_FUNCPTR *EnableVertexAttribArray)(GLuint index); + extern void (CODEGEN_FUNCPTR *GetActiveAttrib)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name); + extern void (CODEGEN_FUNCPTR *GetActiveUniform)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name); + extern void (CODEGEN_FUNCPTR *GetAttachedShaders)(GLuint program, GLsizei maxCount, GLsizei *count, GLuint *obj); + extern GLint (CODEGEN_FUNCPTR *GetAttribLocation)(GLuint program, const GLchar *name); + extern void (CODEGEN_FUNCPTR *GetProgramiv)(GLuint program, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetProgramInfoLog)(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog); + extern void (CODEGEN_FUNCPTR *GetShaderiv)(GLuint shader, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetShaderInfoLog)(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog); + extern void (CODEGEN_FUNCPTR *GetShaderSource)(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *source); + extern GLint (CODEGEN_FUNCPTR *GetUniformLocation)(GLuint program, const GLchar *name); + extern void (CODEGEN_FUNCPTR *GetUniformfv)(GLuint program, GLint location, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetUniformiv)(GLuint program, GLint location, GLint *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribdv)(GLuint index, GLenum pname, GLdouble *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribfv)(GLuint index, GLenum pname, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribiv)(GLuint index, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribPointerv)(GLuint index, GLenum pname, GLvoid* *pointer); + extern GLboolean (CODEGEN_FUNCPTR *IsProgram)(GLuint program); + extern GLboolean (CODEGEN_FUNCPTR *IsShader)(GLuint shader); + extern void (CODEGEN_FUNCPTR *LinkProgram)(GLuint program); + extern void (CODEGEN_FUNCPTR *ShaderSource)(GLuint shader, GLsizei count, const GLchar* const *string, const GLint *length); + extern void (CODEGEN_FUNCPTR *UseProgram)(GLuint program); + extern void (CODEGEN_FUNCPTR *Uniform1f)(GLint location, GLfloat v0); + extern void (CODEGEN_FUNCPTR *Uniform2f)(GLint location, GLfloat v0, GLfloat v1); + extern void (CODEGEN_FUNCPTR *Uniform3f)(GLint location, GLfloat v0, GLfloat v1, GLfloat v2); + extern void (CODEGEN_FUNCPTR *Uniform4f)(GLint location, GLfloat v0, GLfloat v1, GLfloat v2, GLfloat v3); + extern void (CODEGEN_FUNCPTR *Uniform1i)(GLint location, GLint v0); + extern void (CODEGEN_FUNCPTR *Uniform2i)(GLint location, GLint v0, GLint v1); + extern void (CODEGEN_FUNCPTR *Uniform3i)(GLint location, GLint v0, GLint v1, GLint v2); + extern void (CODEGEN_FUNCPTR *Uniform4i)(GLint location, GLint v0, GLint v1, GLint v2, GLint v3); + extern void (CODEGEN_FUNCPTR *Uniform1fv)(GLint location, GLsizei count, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *Uniform2fv)(GLint location, GLsizei count, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *Uniform3fv)(GLint location, GLsizei count, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *Uniform4fv)(GLint location, GLsizei count, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *Uniform1iv)(GLint location, GLsizei count, const GLint *value); + extern 
void (CODEGEN_FUNCPTR *Uniform2iv)(GLint location, GLsizei count, const GLint *value); + extern void (CODEGEN_FUNCPTR *Uniform3iv)(GLint location, GLsizei count, const GLint *value); + extern void (CODEGEN_FUNCPTR *Uniform4iv)(GLint location, GLsizei count, const GLint *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix2fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix3fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix4fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *ValidateProgram)(GLuint program); + extern void (CODEGEN_FUNCPTR *VertexAttribPointer)(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const GLvoid *pointer); + + // Extension: 2.1 + extern void (CODEGEN_FUNCPTR *UniformMatrix2x3fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix3x2fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix2x4fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix4x2fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix3x4fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix4x3fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + + // Extension: ARB_vertex_array_object + extern void (CODEGEN_FUNCPTR *BindVertexArray)(GLuint ren_array); + extern void (CODEGEN_FUNCPTR *DeleteVertexArrays)(GLsizei n, const GLuint *arrays); + extern void (CODEGEN_FUNCPTR *GenVertexArrays)(GLsizei n, GLuint *arrays); + extern GLboolean (CODEGEN_FUNCPTR *IsVertexArray)(GLuint ren_array); + + // Extension: ARB_map_buffer_range + extern GLvoid* (CODEGEN_FUNCPTR *MapBufferRange)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access); + extern void (CODEGEN_FUNCPTR *FlushMappedBufferRange)(GLenum target, GLintptr offset, GLsizeiptr length); + + // Extension: ARB_framebuffer_object + extern GLboolean (CODEGEN_FUNCPTR *IsRenderbuffer)(GLuint renderbuffer); + extern void (CODEGEN_FUNCPTR *BindRenderbuffer)(GLenum target, GLuint renderbuffer); + extern void (CODEGEN_FUNCPTR *DeleteRenderbuffers)(GLsizei n, const GLuint *renderbuffers); + extern void (CODEGEN_FUNCPTR *GenRenderbuffers)(GLsizei n, GLuint *renderbuffers); + extern void (CODEGEN_FUNCPTR *RenderbufferStorage)(GLenum target, GLenum internalformat, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *GetRenderbufferParameteriv)(GLenum target, GLenum pname, GLint *params); + extern GLboolean (CODEGEN_FUNCPTR *IsFramebuffer)(GLuint framebuffer); + extern void (CODEGEN_FUNCPTR *BindFramebuffer)(GLenum target, GLuint framebuffer); + extern void (CODEGEN_FUNCPTR *DeleteFramebuffers)(GLsizei n, const GLuint *framebuffers); + extern void (CODEGEN_FUNCPTR *GenFramebuffers)(GLsizei n, GLuint *framebuffers); + extern GLenum (CODEGEN_FUNCPTR *CheckFramebufferStatus)(GLenum target); + extern void (CODEGEN_FUNCPTR *FramebufferTexture1D)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level); + extern void (CODEGEN_FUNCPTR *FramebufferTexture2D)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint 
level); + extern void (CODEGEN_FUNCPTR *FramebufferTexture3D)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset); + extern void (CODEGEN_FUNCPTR *FramebufferRenderbuffer)(GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer); + extern void (CODEGEN_FUNCPTR *GetFramebufferAttachmentParameteriv)(GLenum target, GLenum attachment, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GenerateMipmap)(GLenum target); + extern void (CODEGEN_FUNCPTR *BlitFramebuffer)(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter); + extern void (CODEGEN_FUNCPTR *RenderbufferStorageMultisample)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *FramebufferTextureLayer)(GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer); + + // Extension: 3.0 + extern void (CODEGEN_FUNCPTR *ColorMaski)(GLuint index, GLboolean r, GLboolean g, GLboolean b, GLboolean a); + extern void (CODEGEN_FUNCPTR *GetBooleani_v)(GLenum target, GLuint index, GLboolean *data); + extern void (CODEGEN_FUNCPTR *GetIntegeri_v)(GLenum target, GLuint index, GLint *data); + extern void (CODEGEN_FUNCPTR *Enablei)(GLenum target, GLuint index); + extern void (CODEGEN_FUNCPTR *Disablei)(GLenum target, GLuint index); + extern GLboolean (CODEGEN_FUNCPTR *IsEnabledi)(GLenum target, GLuint index); + extern void (CODEGEN_FUNCPTR *BeginTransformFeedback)(GLenum primitiveMode); + extern void (CODEGEN_FUNCPTR *EndTransformFeedback)(); + extern void (CODEGEN_FUNCPTR *BindBufferRange)(GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size); + extern void (CODEGEN_FUNCPTR *BindBufferBase)(GLenum target, GLuint index, GLuint buffer); + extern void (CODEGEN_FUNCPTR *TransformFeedbackVaryings)(GLuint program, GLsizei count, const GLchar* const *varyings, GLenum bufferMode); + extern void (CODEGEN_FUNCPTR *GetTransformFeedbackVarying)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLsizei *size, GLenum *type, GLchar *name); + extern void (CODEGEN_FUNCPTR *ClampColor)(GLenum target, GLenum clamp); + extern void (CODEGEN_FUNCPTR *BeginConditionalRender)(GLuint id, GLenum mode); + extern void (CODEGEN_FUNCPTR *EndConditionalRender)(); + extern void (CODEGEN_FUNCPTR *VertexAttribIPointer)(GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid *pointer); + extern void (CODEGEN_FUNCPTR *GetVertexAttribIiv)(GLuint index, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribIuiv)(GLuint index, GLenum pname, GLuint *params); + extern void (CODEGEN_FUNCPTR *VertexAttribI1i)(GLuint index, GLint x); + extern void (CODEGEN_FUNCPTR *VertexAttribI2i)(GLuint index, GLint x, GLint y); + extern void (CODEGEN_FUNCPTR *VertexAttribI3i)(GLuint index, GLint x, GLint y, GLint z); + extern void (CODEGEN_FUNCPTR *VertexAttribI4i)(GLuint index, GLint x, GLint y, GLint z, GLint w); + extern void (CODEGEN_FUNCPTR *VertexAttribI1ui)(GLuint index, GLuint x); + extern void (CODEGEN_FUNCPTR *VertexAttribI2ui)(GLuint index, GLuint x, GLuint y); + extern void (CODEGEN_FUNCPTR *VertexAttribI3ui)(GLuint index, GLuint x, GLuint y, GLuint z); + extern void (CODEGEN_FUNCPTR *VertexAttribI4ui)(GLuint index, GLuint x, GLuint y, GLuint z, GLuint w); + extern void (CODEGEN_FUNCPTR *VertexAttribI1iv)(GLuint index, const GLint *v); + extern void (CODEGEN_FUNCPTR 
*VertexAttribI2iv)(GLuint index, const GLint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI3iv)(GLuint index, const GLint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4iv)(GLuint index, const GLint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI1uiv)(GLuint index, const GLuint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI2uiv)(GLuint index, const GLuint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI3uiv)(GLuint index, const GLuint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4uiv)(GLuint index, const GLuint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4bv)(GLuint index, const GLbyte *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4sv)(GLuint index, const GLshort *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4ubv)(GLuint index, const GLubyte *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4usv)(GLuint index, const GLushort *v); + extern void (CODEGEN_FUNCPTR *GetUniformuiv)(GLuint program, GLint location, GLuint *params); + extern void (CODEGEN_FUNCPTR *BindFragDataLocation)(GLuint program, GLuint color, const GLchar *name); + extern GLint (CODEGEN_FUNCPTR *GetFragDataLocation)(GLuint program, const GLchar *name); + extern void (CODEGEN_FUNCPTR *Uniform1ui)(GLint location, GLuint v0); + extern void (CODEGEN_FUNCPTR *Uniform2ui)(GLint location, GLuint v0, GLuint v1); + extern void (CODEGEN_FUNCPTR *Uniform3ui)(GLint location, GLuint v0, GLuint v1, GLuint v2); + extern void (CODEGEN_FUNCPTR *Uniform4ui)(GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3); + extern void (CODEGEN_FUNCPTR *Uniform1uiv)(GLint location, GLsizei count, const GLuint *value); + extern void (CODEGEN_FUNCPTR *Uniform2uiv)(GLint location, GLsizei count, const GLuint *value); + extern void (CODEGEN_FUNCPTR *Uniform3uiv)(GLint location, GLsizei count, const GLuint *value); + extern void (CODEGEN_FUNCPTR *Uniform4uiv)(GLint location, GLsizei count, const GLuint *value); + extern void (CODEGEN_FUNCPTR *TexParameterIiv)(GLenum target, GLenum pname, const GLint *params); + extern void (CODEGEN_FUNCPTR *TexParameterIuiv)(GLenum target, GLenum pname, const GLuint *params); + extern void (CODEGEN_FUNCPTR *GetTexParameterIiv)(GLenum target, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetTexParameterIuiv)(GLenum target, GLenum pname, GLuint *params); + extern void (CODEGEN_FUNCPTR *ClearBufferiv)(GLenum buffer, GLint drawbuffer, const GLint *value); + extern void (CODEGEN_FUNCPTR *ClearBufferuiv)(GLenum buffer, GLint drawbuffer, const GLuint *value); + extern void (CODEGEN_FUNCPTR *ClearBufferfv)(GLenum buffer, GLint drawbuffer, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *ClearBufferfi)(GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil); + extern const GLubyte * (CODEGEN_FUNCPTR *GetStringi)(GLenum name, GLuint index); + + // Extension: ARB_uniform_buffer_object + extern void (CODEGEN_FUNCPTR *GetUniformIndices)(GLuint program, GLsizei uniformCount, const GLchar* const *uniformNames, GLuint *uniformIndices); + extern void (CODEGEN_FUNCPTR *GetActiveUniformsiv)(GLuint program, GLsizei uniformCount, const GLuint *uniformIndices, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetActiveUniformName)(GLuint program, GLuint uniformIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformName); + extern GLuint (CODEGEN_FUNCPTR *GetUniformBlockIndex)(GLuint program, const GLchar *uniformBlockName); + extern void (CODEGEN_FUNCPTR *GetActiveUniformBlockiv)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params); + extern void 
(CODEGEN_FUNCPTR *GetActiveUniformBlockName)(GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformBlockName); + extern void (CODEGEN_FUNCPTR *UniformBlockBinding)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding); + + // Extension: ARB_copy_buffer + extern void (CODEGEN_FUNCPTR *CopyBufferSubData)(GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size); + + // Extension: 3.1 + extern void (CODEGEN_FUNCPTR *DrawArraysInstanced)(GLenum mode, GLint first, GLsizei count, GLsizei instancecount); + extern void (CODEGEN_FUNCPTR *DrawElementsInstanced)(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices, GLsizei instancecount); + extern void (CODEGEN_FUNCPTR *TexBuffer)(GLenum target, GLenum internalformat, GLuint buffer); + extern void (CODEGEN_FUNCPTR *PrimitiveRestartIndex)(GLuint index); + + // Legacy + extern void (CODEGEN_FUNCPTR *EnableClientState)(GLenum cap); + extern void (CODEGEN_FUNCPTR *DisableClientState)(GLenum cap); + extern void (CODEGEN_FUNCPTR *VertexPointer)(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr); + extern void (CODEGEN_FUNCPTR *NormalPointer)(GLenum type, GLsizei stride, const GLvoid *ptr); + extern void (CODEGEN_FUNCPTR *ColorPointer)(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr); + extern void (CODEGEN_FUNCPTR *TexCoordPointer)(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr); + extern void (CODEGEN_FUNCPTR *TexEnvi)(GLenum target, GLenum pname, GLint param); + extern void (CODEGEN_FUNCPTR *MatrixMode)(GLenum mode); + extern void (CODEGEN_FUNCPTR *LoadIdentity)(void); + extern void (CODEGEN_FUNCPTR *Ortho)(GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble near_val, GLdouble far_val); + extern void (CODEGEN_FUNCPTR *Color3d)(GLdouble red, GLdouble green, GLdouble blue); +} + +#endif // OPENGL_NOLOAD_STYLE_HPP diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index fc291a862a..43f4d613bc 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -45,8 +45,7 @@ #include #ifdef HAVE_CUDA - #include - #include + #include #include #define CUDART_MINIMUM_REQUIRED_VERSION 4010 @@ -69,33 +68,89 @@ using namespace cv::gpu; namespace { - // Compares value to set using the given comparator. Returns true if - // there is at least one element x in the set satisfying to: x cmp value - // predicate. 
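
Note on the header closed above: it follows the usual "no-load style" loader pattern. Every GL entry point is an extern function pointer in namespace gl, resolved at runtime from the active context rather than linked against a GL import library. The sketch below only illustrates that pattern; the LoadFunctions helper, the GetProcAddressFunc callback, and the empty CODEGEN_FUNCPTR definition are hypothetical stand-ins, not code from this patch.

    // Illustrative only: how a "no-load" header is typically initialized.
    #define CODEGEN_FUNCPTR   // calling-convention macro; assumed empty here

    namespace gl
    {
        // one pointer per entry point, mirroring the declarations above
        void (CODEGEN_FUNCPTR *Clear)(unsigned int mask) = 0;
    }

    // e.g. a wrapper around wglGetProcAddress / glXGetProcAddressARB
    typedef void* (*GetProcAddressFunc)(const char* name);

    namespace sys
    {
        // resolve the pointers once a GL context is current
        bool LoadFunctions(GetProcAddressFunc getProc)
        {
            gl::Clear = reinterpret_cast<void (CODEGEN_FUNCPTR *)(unsigned int)>(getProc("glClear"));
            return gl::Clear != 0;
        }
    }
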
- template - bool compareToSet(const std::string& set_as_str, int value, Comparer cmp) + class CudaArch + { + public: + CudaArch(); + + bool builtWith(FeatureSet feature_set) const; + bool hasPtx(int major, int minor) const; + bool hasBin(int major, int minor) const; + bool hasEqualOrLessPtx(int major, int minor) const; + bool hasEqualOrGreaterPtx(int major, int minor) const; + bool hasEqualOrGreaterBin(int major, int minor) const; + + private: + static void fromStr(const string& set_as_str, vector& arr); + + vector bin; + vector ptx; + vector features; + }; + + const CudaArch cudaArch; + + CudaArch::CudaArch() + { + #ifdef HAVE_CUDA + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + #endif + } + + bool CudaArch::builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool CudaArch::hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool CudaArch::hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + void CudaArch::fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) - return false; + return; - std::stringstream stream(set_as_str); + istringstream stream(set_as_str); int cur_value; while (!stream.eof()) { stream >> cur_value; - if (cmp(cur_value, value)) - return true; + arr.push_back(cur_value); } - return false; + sort(arr.begin(), arr.end()); } } bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal()); + return cudaArch.builtWith(feature_set); #else (void)feature_set; return false; @@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor) bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to()); + return cudaArch.hasPtx(major, minor); #else (void)major; (void)minor; @@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor) bool cv::gpu::TargetArchs::hasBin(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to()); + return cudaArch.hasBin(major, minor); #else (void)major; (void)minor; @@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, - std::less_equal()); + return cudaArch.hasEqualOrLessPtx(major, minor); #else (void)major; (void)minor; @@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { - return hasEqualOrGreaterPtx(major, minor) || - hasEqualOrGreaterBin(major, minor); + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } bool 
cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal()); + return cudaArch.hasEqualOrGreaterPtx(major, minor); #else (void)major; (void)minor; @@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, - std::greater_equal()); + return cudaArch.hasEqualOrGreaterBin(major, minor); #else (void)major; (void)minor; @@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) #endif } +bool cv::gpu::deviceSupports(FeatureSet feature_set) +{ + static int versions[] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else + { + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; + } + + return TargetArchs::builtWith(feature_set) && (version >= feature_set); +} + #if !defined (HAVE_CUDA) #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") @@ -316,18 +393,6 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) namespace { - template void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) - { - *attribute = T(); - //CUresult error = CUDA_SUCCESS;// = cuDeviceGetAttribute( attribute, device_attribute, device ); why link erros under ubuntu?? 
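
The new cv::gpu::deviceSupports() above combines the compile-time architecture list (TargetArchs, now backed by the CudaArch cache) with the runtime compute capability of the current device, memoizing the version per device id. A minimal usage sketch, assuming only the public API introduced in this hunk (FEATURE_SET_COMPUTE_20 is an existing FeatureSet value):

    #include "opencv2/core/gpumat.hpp"
    #include <cstdio>

    int main()
    {
        // True only if the library was built with code for this feature set
        // AND the currently selected device is new enough to run it.
        if (cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20))
            std::printf("sm_20 (Fermi-class) kernels can be used on this device\n");
        else
            std::printf("falling back to pre-Fermi code paths\n");
        return 0;
    }
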
- CUresult error = cuDeviceGetAttribute( attribute, device_attribute, device ); - if( CUDA_SUCCESS == error ) - return; - - printf("Driver API error = %04d\n", error); - cv::gpu::error("driver API error", __FILE__, __LINE__); - } - int convertSMVer2Cores(int major, int minor) { // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM @@ -336,7 +401,7 @@ namespace int Cores; } SMtoCores; - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, { -1, -1 } }; + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; int index = 0; while (gpuArchCoresPerSM[index].SM != -1) @@ -345,7 +410,7 @@ namespace return gpuArchCoresPerSM[index].Cores; index++; } - printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor); + return -1; } } @@ -383,22 +448,13 @@ void cv::gpu::printCudaDeviceInfo(int device) printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", - prop.multiProcessorCount, convertSMVer2Cores(prop.major, prop.minor), - convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output - int memoryClock, memBusWidth, L2CacheSize; - getCudaAttribute( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); - getCudaAttribute( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); - getCudaAttribute( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); - - printf(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f); - printf(" Memory Bus Width: %d-bit\n", memBusWidth); - if (L2CacheSize) - printf(" L2 Cache Size: %d bytes\n", L2CacheSize); - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); @@ -458,7 +514,12 @@ void cv::gpu::printShortCudaDeviceInfo(int device) const char *arch_str = prop.major < 2 ? 
" (not Fermi)" : ""; printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); } fflush(stdout); @@ -704,6 +765,43 @@ cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), re m.download(*this); } +void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m) +{ + int area = rows * cols; + if (m.empty() || m.type() != type || !m.isContinuous() || m.size().area() < area) + m.create(1, area, type); + + m.cols = cols; + m.rows = rows; + m.step = m.elemSize() * cols; + m.flags |= Mat::CONTINUOUS_FLAG; +} + +void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) +{ + if (m.empty() || m.type() != type || m.data != m.datastart) + m.create(rows, cols, type); + else + { + const size_t esz = m.elemSize(); + const ptrdiff_t delta2 = m.dataend - m.datastart; + + const size_t minstep = m.cols * esz; + + Size wholeSize; + wholeSize.height = std::max(static_cast((delta2 - minstep) / m.step + 1), m.rows); + wholeSize.width = std::max(static_cast((delta2 - m.step * (wholeSize.height - 1)) / esz), m.cols); + + if (wholeSize.height < rows || wholeSize.width < cols) + m.create(rows, cols, type); + else + { + m.cols = cols; + m.rows = rows; + } + } +} + namespace { class GpuFuncTable diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 42c287593f..0776ca6248 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -922,8 +922,8 @@ _InputArray::_InputArray(const Mat& m) : flags(MAT), obj((void*)&m) {} _InputArray::_InputArray(const vector& vec) : flags(STD_VECTOR_MAT), obj((void*)&vec) {} _InputArray::_InputArray(const double& val) : flags(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F), obj((void*)&val), sz(Size(1,1)) {} _InputArray::_InputArray(const MatExpr& expr) : flags(FIXED_TYPE + FIXED_SIZE + EXPR), obj((void*)&expr) {} -_InputArray::_InputArray(const GlBuffer& buf) : flags(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER), obj((void*)&buf) {} -_InputArray::_InputArray(const GlTexture& tex) : flags(FIXED_TYPE + FIXED_SIZE + OPENGL_TEXTURE), obj((void*)&tex) {} +_InputArray::_InputArray(const GlBuffer& buf) : flags(OPENGL_BUFFER), obj((void*)&buf) {} +_InputArray::_InputArray(const GlTexture2D &tex) : flags(OPENGL_TEXTURE2D), obj((void*)&tex) {} _InputArray::_InputArray(const gpu::GpuMat& d_mat) : flags(GPU_MAT), obj((void*)&d_mat) {} Mat _InputArray::getMat(int i) const @@ -1076,14 +1076,14 @@ GlBuffer _InputArray::getGlBuffer() const } } -GlTexture _InputArray::getGlTexture() const +GlTexture2D _InputArray::getGlTexture2D() const { int k = kind(); - CV_Assert(k == OPENGL_TEXTURE); + CV_Assert(k == OPENGL_TEXTURE2D); //if( k == OPENGL_TEXTURE ) { - const GlTexture* tex = (const GlTexture*)obj; + const GlTexture2D* tex = (const GlTexture2D*)obj; return *tex; } } @@ -1168,10 +1168,10 @@ Size _InputArray::size(int i) const return buf->size(); } - if( k == OPENGL_TEXTURE ) + if( k == OPENGL_TEXTURE2D ) { CV_Assert( i < 0 ); - const GlTexture* tex = (const GlTexture*)obj; + const GlTexture2D* tex = (const GlTexture2D*)obj; return tex->size(); 
} @@ -1216,9 +1216,6 @@ int _InputArray::type(int i) const if( k == OPENGL_BUFFER ) return ((const GlBuffer*)obj)->type(); - if( k == OPENGL_TEXTURE ) - return ((const GlTexture*)obj)->type(); - CV_Assert( k == GPU_MAT ); //if( k == GPU_MAT ) return ((const gpu::GpuMat*)obj)->type(); @@ -1271,8 +1268,8 @@ bool _InputArray::empty() const if( k == OPENGL_BUFFER ) return ((const GlBuffer*)obj)->empty(); - if( k == OPENGL_TEXTURE ) - return ((const GlTexture*)obj)->empty(); + if( k == OPENGL_TEXTURE2D ) + return ((const GlTexture2D*)obj)->empty(); CV_Assert( k == GPU_MAT ); //if( k == GPU_MAT ) @@ -1285,10 +1282,14 @@ _OutputArray::~_OutputArray() {} _OutputArray::_OutputArray(Mat& m) : _InputArray(m) {} _OutputArray::_OutputArray(vector& vec) : _InputArray(vec) {} _OutputArray::_OutputArray(gpu::GpuMat& d_mat) : _InputArray(d_mat) {} +_OutputArray::_OutputArray(GlBuffer& buf) : _InputArray(buf) {} +_OutputArray::_OutputArray(GlTexture2D& tex) : _InputArray(tex) {} _OutputArray::_OutputArray(const Mat& m) : _InputArray(m) {flags |= FIXED_SIZE|FIXED_TYPE;} _OutputArray::_OutputArray(const vector& vec) : _InputArray(vec) {flags |= FIXED_SIZE;} _OutputArray::_OutputArray(const gpu::GpuMat& d_mat) : _InputArray(d_mat) {flags |= FIXED_SIZE|FIXED_TYPE;} +_OutputArray::_OutputArray(const GlBuffer& buf) : _InputArray(buf) {flags |= FIXED_SIZE|FIXED_TYPE;} +_OutputArray::_OutputArray(const GlTexture2D& tex) : _InputArray(tex) {flags |= FIXED_SIZE|FIXED_TYPE;} bool _OutputArray::fixedSize() const @@ -1318,6 +1319,13 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int ((gpu::GpuMat*)obj)->create(_sz, mtype); return; } + if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 ) + { + CV_Assert(!fixedSize() || ((GlBuffer*)obj)->size() == _sz); + CV_Assert(!fixedType() || ((GlBuffer*)obj)->type() == mtype); + ((GlBuffer*)obj)->create(_sz, mtype); + return; + } int sizes[] = {_sz.height, _sz.width}; create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); } @@ -1339,6 +1347,13 @@ void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransp ((gpu::GpuMat*)obj)->create(rows, cols, mtype); return; } + if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 ) + { + CV_Assert(!fixedSize() || ((GlBuffer*)obj)->size() == Size(cols, rows)); + CV_Assert(!fixedType() || ((GlBuffer*)obj)->type() == mtype); + ((GlBuffer*)obj)->create(rows, cols, mtype); + return; + } int sizes[] = {rows, cols}; create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); } @@ -1558,6 +1573,18 @@ void _OutputArray::release() const return; } + if( k == OPENGL_BUFFER ) + { + ((GlBuffer*)obj)->release(); + return; + } + + if( k == OPENGL_TEXTURE2D ) + { + ((GlTexture2D*)obj)->release(); + return; + } + if( k == NONE ) return; @@ -1623,6 +1650,20 @@ gpu::GpuMat& _OutputArray::getGpuMatRef() const return *(gpu::GpuMat*)obj; } +GlBuffer& _OutputArray::getGlBufferRef() const +{ + int k = kind(); + CV_Assert( k == OPENGL_BUFFER ); + return *(GlBuffer*)obj; +} + +GlTexture2D& _OutputArray::getGlTexture2DRef() const +{ + int k = kind(); + CV_Assert( k == OPENGL_TEXTURE2D ); + return *(GlTexture2D*)obj; +} + static _OutputArray _none; OutputArray noArray() { return _none; } diff --git a/modules/core/src/opengl_interop.cpp b/modules/core/src/opengl_interop.cpp index 12589b7ba3..befc63f3f7 100644 --- a/modules/core/src/opengl_interop.cpp +++ b/modules/core/src/opengl_interop.cpp @@ -41,26 +41,11 @@ //M*/ #include "precomp.hpp" -#include #include 
"opencv2/core/opengl_interop.hpp" #include "opencv2/core/gpumat.hpp" -#if defined WIN32 || defined _WIN32 || defined WINCE -#include -#undef small -#undef min -#undef max -#undef abs -#endif - #ifdef HAVE_OPENGL - #ifdef __APPLE__ - #include - #include - #else - #include - #include - #endif + #include "gl_core_3_1.hpp" #ifdef HAVE_CUDA #include @@ -72,213 +57,258 @@ using namespace std; using namespace cv; using namespace cv::gpu; -#ifndef HAVE_OPENGL - #define throw_nogl CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support") - #define throw_nocuda CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") -#else - #define throw_nogl CV_Error(CV_OpenGlNotSupported, "OpenGL context doesn't exist") - - #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - #define throw_nocuda CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") +namespace +{ + #ifndef HAVE_OPENGL + void throw_nogl() { CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support"); } #else - #if defined(__GNUC__) - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) - #else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) - #endif + void throw_nogl() { CV_Error(CV_OpenGlApiCallError, "OpenGL context doesn't exist"); } - namespace - { - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + #ifndef HAVE_CUDA + void throw_nocuda() { CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support"); } + #else + void throw_nocuda() { CV_Error(CV_StsNotImplemented, "The called functionality is disabled for current build or platform"); } + + #if defined(__GNUC__) + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) + #else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) + #endif + + void ___cudaSafeCall(cudaError_t err, const char* file, const int line, const char* func = "") { if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); } - } - #endif // HAVE_CUDA -#endif + #endif + #endif +} -namespace +bool cv::checkGlError(const char* file, const int line, const char* func) { - class EmptyGlFuncTab : public CvOpenGlFuncTab +#ifndef HAVE_OPENGL + (void) file; + (void) line; + (void) func; + return true; +#else + GLenum err = gl::GetError(); + + if (err != gl::NO_ERROR_) { - public: - void genBuffers(int, unsigned int*) const { throw_nogl; } - void deleteBuffers(int, const unsigned int*) const { throw_nogl; } + const char* msg; - void bufferData(unsigned int, ptrdiff_t, const void*, unsigned int) const { throw_nogl; } - void bufferSubData(unsigned int, ptrdiff_t, ptrdiff_t, const void*) const { throw_nogl; } + switch (err) + { + case gl::INVALID_ENUM: + msg = "An unacceptable value is specified for an enumerated argument"; + break; - void bindBuffer(unsigned int, unsigned int) const { throw_nogl; } + case gl::INVALID_VALUE: + msg = "A numeric argument is out of range"; + break; - void* mapBuffer(unsigned int, unsigned int) const { throw_nogl; return 0; } - void unmapBuffer(unsigned int) const { throw_nogl; } + case gl::INVALID_OPERATION: + msg = "The specified operation is not allowed in the current state"; + break; - void generateBitmapFont(const std::string&, int, int, bool, bool, int, int, int) const { throw_nogl; } + case gl::OUT_OF_MEMORY: + msg = "There is not enough 
memory left to execute the command"; + break; - bool isGlContextInitialized() const { return false; } - }; + default: + msg = "Unknown error"; + }; - const CvOpenGlFuncTab* g_glFuncTab = 0; + cvError(CV_OpenGlApiCallError, func, msg, file, line); -#if defined HAVE_CUDA || defined HAVE_OPENGL - const CvOpenGlFuncTab* glFuncTab() - { - static EmptyGlFuncTab empty; - return g_glFuncTab ? g_glFuncTab : ∅ + return false; } + + return true; #endif } -CvOpenGlFuncTab::~CvOpenGlFuncTab() -{ - if (g_glFuncTab == this) - g_glFuncTab = 0; -} - -void icvSetOpenGlFuncTab(const CvOpenGlFuncTab* tab) -{ - g_glFuncTab = tab; -} - #ifdef HAVE_OPENGL - #ifndef GL_DYNAMIC_DRAW - #define GL_DYNAMIC_DRAW 0x88E8 - #endif +namespace +{ + const GLenum gl_types[] = { gl::UNSIGNED_BYTE, gl::BYTE, gl::UNSIGNED_SHORT, gl::SHORT, gl::INT, gl::FLOAT, gl::DOUBLE }; +} +#endif - #ifndef GL_READ_WRITE - #define GL_READ_WRITE 0x88BA - #endif - - #ifndef GL_BGR - #define GL_BGR 0x80E0 - #endif - - #ifndef GL_BGRA - #define GL_BGRA 0x80E1 - #endif - - namespace - { - const GLenum gl_types[] = {GL_UNSIGNED_BYTE, GL_BYTE, GL_UNSIGNED_SHORT, GL_SHORT, GL_INT, GL_FLOAT, GL_DOUBLE}; - - #ifdef HAVE_CUDA - bool g_isCudaGlDeviceInitialized = false; - #endif - } -#endif // HAVE_OPENGL +//////////////////////////////////////////////////////////////////////// +// setGlDevice void cv::gpu::setGlDevice(int device) { -#if !defined HAVE_CUDA || defined(CUDA_DISABLER) - (void)device; - throw_nocuda; +#ifndef HAVE_OPENGL + (void) device; + throw_nogl(); #else - #ifndef HAVE_OPENGL - (void)device; - throw_nogl; + #if !defined(HAVE_CUDA) || defined(CUDA_DISABLER) + (void) device; + throw_nocuda(); #else - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; - cudaSafeCall( cudaGLSetGLDevice(device) ); - - g_isCudaGlDeviceInitialized = true; #endif #endif } //////////////////////////////////////////////////////////////////////// -// CudaGlInterop +// CudaResource + +#if defined(HAVE_OPENGL) && defined(HAVE_CUDA) && !defined(CUDA_DISABLER) -#if defined HAVE_CUDA && defined HAVE_OPENGL namespace { - class CudaGlInterop + class CudaResource { public: - CudaGlInterop(); - ~CudaGlInterop(); + CudaResource(); + ~CudaResource(); - void registerBuffer(unsigned int buffer); + void registerBuffer(GLuint buffer); + void release(); - void copyFrom(const GpuMat& mat, cudaStream_t stream = 0); + void copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream = 0); + void copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream = 0); - GpuMat map(int rows, int cols, int type, cudaStream_t stream = 0); + void* map(cudaStream_t stream = 0); void unmap(cudaStream_t stream = 0); private: cudaGraphicsResource_t resource_; + GLuint buffer_; + + class GraphicsMapHolder; }; - inline CudaGlInterop::CudaGlInterop() : resource_(0) + CudaResource::CudaResource() : resource_(0), buffer_(0) { } - CudaGlInterop::~CudaGlInterop() + CudaResource::~CudaResource() { - if (resource_) - { - cudaGraphicsUnregisterResource(resource_); - resource_ = 0; - } + release(); } - void CudaGlInterop::registerBuffer(unsigned int buffer) + void CudaResource::registerBuffer(GLuint buffer) { - if (!g_isCudaGlDeviceInitialized) - cvError(CV_GpuApiCallError, "registerBuffer", "cuda GL device wasn't initialized, call setGlDevice", __FILE__, __LINE__); + CV_DbgAssert( buffer != 0 ); + + if (buffer_ == buffer) + return; cudaGraphicsResource_t resource; cudaSafeCall( cudaGraphicsGLRegisterBuffer(&resource, buffer, cudaGraphicsMapFlagsNone) 
); + release(); + resource_ = resource; + buffer_ = buffer; } - void CudaGlInterop::copyFrom(const GpuMat& mat, cudaStream_t stream) + void CudaResource::release() { - CV_Assert(resource_ != 0); + if (resource_) + cudaGraphicsUnregisterResource(resource_); - cudaSafeCall( cudaGraphicsMapResources(1, &resource_, stream) ); + resource_ = 0; + buffer_ = 0; + } - void* dst_ptr; - size_t num_bytes; - cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&dst_ptr, &num_bytes, resource_) ); + class CudaResource::GraphicsMapHolder + { + public: + GraphicsMapHolder(cudaGraphicsResource_t* resource, cudaStream_t stream); + ~GraphicsMapHolder(); - const void* src_ptr = mat.ptr(); - size_t widthBytes = mat.cols * mat.elemSize(); + void reset(); - CV_Assert(widthBytes * mat.rows <= num_bytes); + private: + cudaGraphicsResource_t* resource_; + cudaStream_t stream_; + }; + + CudaResource::GraphicsMapHolder::GraphicsMapHolder(cudaGraphicsResource_t* resource, cudaStream_t stream) : resource_(resource), stream_(stream) + { + if (resource_) + cudaSafeCall( cudaGraphicsMapResources(1, resource_, stream_) ); + } + + CudaResource::GraphicsMapHolder::~GraphicsMapHolder() + { + if (resource_) + cudaGraphicsUnmapResources(1, resource_, stream_); + } + + void CudaResource::GraphicsMapHolder::reset() + { + resource_ = 0; + } + + void CudaResource::copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream) + { + CV_DbgAssert( resource_ != 0 ); + + GraphicsMapHolder h(&resource_, stream); + (void) h; + + void* dst; + size_t size; + cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&dst, &size, resource_) ); + + CV_DbgAssert( width * height == size ); if (stream == 0) - cudaSafeCall( cudaMemcpy2D(dst_ptr, widthBytes, src_ptr, mat.step, widthBytes, mat.rows, cudaMemcpyDeviceToDevice) ); + cudaSafeCall( cudaMemcpy2D(dst, width, src, spitch, width, height, cudaMemcpyDeviceToDevice) ); else - cudaSafeCall( cudaMemcpy2DAsync(dst_ptr, widthBytes, src_ptr, mat.step, widthBytes, mat.rows, cudaMemcpyDeviceToDevice, stream) ); - - cudaGraphicsUnmapResources(1, &resource_, stream); + cudaSafeCall( cudaMemcpy2DAsync(dst, width, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream) ); } - GpuMat CudaGlInterop::map(int rows, int cols, int type, cudaStream_t stream) + void CudaResource::copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream) { - CV_Assert(resource_ != 0); + CV_DbgAssert( resource_ != 0 ); - cudaSafeCall( cudaGraphicsMapResources(1, &resource_, stream) ); + GraphicsMapHolder h(&resource_, stream); + (void) h; + + void* src; + size_t size; + cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&src, &size, resource_) ); + + CV_DbgAssert( width * height == size ); + + if (stream == 0) + cudaSafeCall( cudaMemcpy2D(dst, dpitch, src, width, width, height, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpy2DAsync(dst, dpitch, src, width, width, height, cudaMemcpyDeviceToDevice, stream) ); + } + + void* CudaResource::map(cudaStream_t stream) + { + CV_DbgAssert( resource_ != 0 ); + + GraphicsMapHolder h(&resource_, stream); void* ptr; - size_t num_bytes; - cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&ptr, &num_bytes, resource_) ); + size_t size; + cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&ptr, &size, resource_) ); - CV_Assert( static_cast(cols) * CV_ELEM_SIZE(type) * rows <= num_bytes ); + h.reset(); - return GpuMat(rows, cols, type, ptr); + return ptr; } - inline void CudaGlInterop::unmap(cudaStream_t stream) + void 
CudaResource::unmap(cudaStream_t stream) { + CV_Assert( resource_ != 0 ); + cudaGraphicsUnmapResources(1, &resource_, stream); } } -#endif // HAVE_CUDA && HAVE_OPENGL + +#endif //////////////////////////////////////////////////////////////////////// // GlBuffer @@ -296,393 +326,466 @@ class cv::GlBuffer::Impl public: static const Ptr& empty(); - Impl(int rows, int cols, int type, unsigned int target); - Impl(const Mat& m, unsigned int target); + Impl(GLuint bufId, bool autoRelease); + Impl(GLsizeiptr size, const GLvoid* data, GLenum target, bool autoRelease); ~Impl(); - void copyFrom(const Mat& m, unsigned int target); + void bind(GLenum target) const; + + void copyFrom(GLuint srcBuf, GLsizeiptr size); + + void copyFrom(GLsizeiptr size, const GLvoid* data); + void copyTo(GLsizeiptr size, GLvoid* data) const; + + void* mapHost(GLenum access); + void unmapHost(); #ifdef HAVE_CUDA - void copyFrom(const GpuMat& mat, cudaStream_t stream = 0); -#endif + void copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream = 0); + void copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream = 0) const; - void bind(unsigned int target) const; - void unbind(unsigned int target) const; - - Mat mapHost(int rows, int cols, int type, unsigned int target); - void unmapHost(unsigned int target); - -#ifdef HAVE_CUDA - GpuMat mapDevice(int rows, int cols, int type, cudaStream_t stream = 0); + void* mapDevice(cudaStream_t stream = 0); void unmapDevice(cudaStream_t stream = 0); #endif + void setAutoRelease(bool flag) { autoRelease_ = flag; } + + GLuint bufId() const { return bufId_; } + private: Impl(); - unsigned int buffer_; + GLuint bufId_; + bool autoRelease_; #ifdef HAVE_CUDA - CudaGlInterop cudaGlInterop_; + mutable CudaResource cudaResource_; #endif }; -inline const Ptr& cv::GlBuffer::Impl::empty() +const Ptr& cv::GlBuffer::Impl::empty() { static Ptr p(new Impl); return p; } -inline cv::GlBuffer::Impl::Impl() : buffer_(0) +cv::GlBuffer::Impl::Impl() : bufId_(0), autoRelease_(true) { } -cv::GlBuffer::Impl::Impl(int rows, int cols, int type, unsigned int target) : buffer_(0) +cv::GlBuffer::Impl::Impl(GLuint abufId, bool autoRelease) : bufId_(abufId), autoRelease_(autoRelease) { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; - - CV_DbgAssert(rows > 0 && cols > 0); - CV_DbgAssert(CV_MAT_DEPTH(type) >= 0 && CV_MAT_DEPTH(type) <= CV_64F); - - glFuncTab()->genBuffers(1, &buffer_); - CV_CheckGlError(); - CV_Assert(buffer_ != 0); - - size_t size = rows * cols * CV_ELEM_SIZE(type); - - glFuncTab()->bindBuffer(target, buffer_); - CV_CheckGlError(); - - glFuncTab()->bufferData(target, size, 0, GL_DYNAMIC_DRAW); - CV_CheckGlError(); - - glFuncTab()->bindBuffer(target, 0); - -#ifdef HAVE_CUDA - if (g_isCudaGlDeviceInitialized) - cudaGlInterop_.registerBuffer(buffer_); -#endif } -cv::GlBuffer::Impl::Impl(const Mat& m, unsigned int target) : buffer_(0) +cv::GlBuffer::Impl::Impl(GLsizeiptr size, const GLvoid* data, GLenum target, bool autoRelease) : bufId_(0), autoRelease_(autoRelease) { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; - - CV_DbgAssert(m.rows > 0 && m.cols > 0); - CV_DbgAssert(m.depth() >= 0 && m.depth() <= CV_64F); - CV_Assert(m.isContinuous()); - - glFuncTab()->genBuffers(1, &buffer_); - CV_CheckGlError(); - CV_Assert(buffer_ != 0); - - size_t size = m.rows * m.cols * m.elemSize(); - - glFuncTab()->bindBuffer(target, buffer_); + gl::GenBuffers(1, &bufId_); CV_CheckGlError(); - glFuncTab()->bufferData(target, size, m.data, 
GL_DYNAMIC_DRAW); + CV_Assert( bufId_ != 0 ); + + gl::BindBuffer(target, bufId_); CV_CheckGlError(); - glFuncTab()->bindBuffer(target, 0); + gl::BufferData(target, size, data, gl::DYNAMIC_DRAW); + CV_CheckGlError(); -#ifdef HAVE_CUDA - if (g_isCudaGlDeviceInitialized) - cudaGlInterop_.registerBuffer(buffer_); -#endif + gl::BindBuffer(target, 0); + CV_CheckGlError(); } cv::GlBuffer::Impl::~Impl() { - try + if (autoRelease_ && bufId_) + gl::DeleteBuffers(1, &bufId_); +} + +void cv::GlBuffer::Impl::bind(GLenum target) const +{ + gl::BindBuffer(target, bufId_); + CV_CheckGlError(); +} + +void cv::GlBuffer::Impl::copyFrom(GLuint srcBuf, GLsizeiptr size) +{ + gl::BindBuffer(gl::COPY_WRITE_BUFFER, bufId_); + CV_CheckGlError(); + + gl::BindBuffer(gl::COPY_READ_BUFFER, srcBuf); + CV_CheckGlError(); + + gl::CopyBufferSubData(gl::COPY_READ_BUFFER, gl::COPY_WRITE_BUFFER, 0, 0, size); + CV_CheckGlError(); +} + +void cv::GlBuffer::Impl::copyFrom(GLsizeiptr size, const GLvoid* data) +{ + gl::BindBuffer(gl::COPY_WRITE_BUFFER, bufId_); + CV_CheckGlError(); + + gl::BufferSubData(gl::COPY_WRITE_BUFFER, 0, size, data); + CV_CheckGlError(); +} + +void cv::GlBuffer::Impl::copyTo(GLsizeiptr size, GLvoid* data) const +{ + gl::BindBuffer(gl::COPY_READ_BUFFER, bufId_); + CV_CheckGlError(); + + gl::GetBufferSubData(gl::COPY_READ_BUFFER, 0, size, data); + CV_CheckGlError(); +} + +void* cv::GlBuffer::Impl::mapHost(GLenum access) +{ + gl::BindBuffer(gl::COPY_READ_BUFFER, bufId_); + CV_CheckGlError(); + + GLvoid* data = gl::MapBuffer(gl::COPY_READ_BUFFER, access); + CV_CheckGlError(); + + return data; +} + +void cv::GlBuffer::Impl::unmapHost() +{ + gl::UnmapBuffer(gl::COPY_READ_BUFFER); +} + +#ifdef HAVE_CUDA + void cv::GlBuffer::Impl::copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream) { - if (buffer_) - glFuncTab()->deleteBuffers(1, &buffer_); + cudaResource_.registerBuffer(bufId_); + cudaResource_.copyFrom(src, spitch, width, height, stream); } -#ifdef _DEBUG - catch(const exception& e) + + void cv::GlBuffer::Impl::copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream) const { - cerr << e.what() << endl; + cudaResource_.registerBuffer(bufId_); + cudaResource_.copyTo(dst, dpitch, width, height, stream); + } + + void* cv::GlBuffer::Impl::mapDevice(cudaStream_t stream) + { + cudaResource_.registerBuffer(bufId_); + return cudaResource_.map(stream); + } + + void cv::GlBuffer::Impl::unmapDevice(cudaStream_t stream) + { + cudaResource_.unmap(stream); } #endif - catch(...) 
- { - } -} - -void cv::GlBuffer::Impl::copyFrom(const Mat& m, unsigned int target) -{ - CV_Assert(buffer_ != 0); - - CV_Assert(m.isContinuous()); - - bind(target); - - size_t size = m.rows * m.cols * m.elemSize(); - - glFuncTab()->bufferSubData(target, 0, size, m.data); - CV_CheckGlError(); - - unbind(target); -} - -#ifdef HAVE_CUDA - -void cv::GlBuffer::Impl::copyFrom(const GpuMat& mat, cudaStream_t stream) -{ - if (!g_isCudaGlDeviceInitialized) - cvError(CV_GpuApiCallError, "copyFrom", "cuda GL device wasn't initialized, call setGlDevice", __FILE__, __LINE__); - - CV_Assert(buffer_ != 0); - - cudaGlInterop_.copyFrom(mat, stream); -} - -#endif // HAVE_CUDA - -inline void cv::GlBuffer::Impl::bind(unsigned int target) const -{ - CV_Assert(buffer_ != 0); - - glFuncTab()->bindBuffer(target, buffer_); - CV_CheckGlError(); -} - -inline void cv::GlBuffer::Impl::unbind(unsigned int target) const -{ - glFuncTab()->bindBuffer(target, 0); -} - -inline Mat cv::GlBuffer::Impl::mapHost(int rows, int cols, int type, unsigned int target) -{ - void* ptr = glFuncTab()->mapBuffer(target, GL_READ_WRITE); - CV_CheckGlError(); - - return Mat(rows, cols, type, ptr); -} - -inline void cv::GlBuffer::Impl::unmapHost(unsigned int target) -{ - glFuncTab()->unmapBuffer(target); -} - -#ifdef HAVE_CUDA - -inline GpuMat cv::GlBuffer::Impl::mapDevice(int rows, int cols, int type, cudaStream_t stream) -{ - if (!g_isCudaGlDeviceInitialized) - cvError(CV_GpuApiCallError, "copyFrom", "cuda GL device wasn't initialized, call setGlDevice", __FILE__, __LINE__); - - CV_Assert(buffer_ != 0); - - return cudaGlInterop_.map(rows, cols, type, stream); -} - -inline void cv::GlBuffer::Impl::unmapDevice(cudaStream_t stream) -{ - if (!g_isCudaGlDeviceInitialized) - cvError(CV_GpuApiCallError, "copyFrom", "cuda GL device wasn't initialized, call setGlDevice", __FILE__, __LINE__); - - cudaGlInterop_.unmap(stream); -} - -#endif // HAVE_CUDA #endif // HAVE_OPENGL -cv::GlBuffer::GlBuffer(Usage _usage) : rows_(0), cols_(0), type_(0), usage_(_usage) +cv::GlBuffer::GlBuffer() : rows_(0), cols_(0), type_(0) { #ifndef HAVE_OPENGL - (void)_usage; - throw_nogl; + throw_nogl(); #else impl_ = Impl::empty(); #endif } -cv::GlBuffer::GlBuffer(int _rows, int _cols, int _type, Usage _usage) : rows_(0), cols_(0), type_(0), usage_(_usage) +cv::GlBuffer::GlBuffer(int arows, int acols, int atype, unsigned int abufId, bool autoRelease) : rows_(0), cols_(0), type_(0) { #ifndef HAVE_OPENGL - (void)_rows; - (void)_cols; - (void)_type; - (void)_usage; - throw_nogl; + (void) arows; + (void) acols; + (void) atype; + (void) abufId; + (void) autoRelease; + throw_nogl(); #else - impl_ = new Impl(_rows, _cols, _type, _usage); - rows_ = _rows; - cols_ = _cols; - type_ = _type; + impl_ = new Impl(abufId, autoRelease); + rows_ = arows; + cols_ = acols; + type_ = atype; #endif } -cv::GlBuffer::GlBuffer(Size _size, int _type, Usage _usage) : rows_(0), cols_(0), type_(0), usage_(_usage) +cv::GlBuffer::GlBuffer(Size asize, int atype, unsigned int abufId, bool autoRelease) : rows_(0), cols_(0), type_(0) { #ifndef HAVE_OPENGL - (void)_size; - (void)_type; - (void)_usage; - throw_nogl; + (void) asize; + (void) atype; + (void) abufId; + (void) autoRelease; + throw_nogl(); #else - impl_ = new Impl(_size.height, _size.width, _type, _usage); - rows_ = _size.height; - cols_ = _size.width; - type_ = _type; + impl_ = new Impl(abufId, autoRelease); + rows_ = asize.height; + cols_ = asize.width; + type_ = atype; #endif } -cv::GlBuffer::GlBuffer(InputArray mat_, Usage _usage) : rows_(0), 
cols_(0), type_(0), usage_(_usage) +cv::GlBuffer::GlBuffer(int arows, int acols, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0) +{ + create(arows, acols, atype, target, autoRelease); +} + +cv::GlBuffer::GlBuffer(Size asize, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0) +{ + create(asize, atype, target, autoRelease); +} + +cv::GlBuffer::GlBuffer(InputArray arr, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0) { #ifndef HAVE_OPENGL - (void)mat_; - (void)_usage; - throw_nogl; + (void) arr; + (void) target; + (void) autoRelease; + throw_nogl(); #else - int kind = mat_.kind(); - Size _size = mat_.size(); - int _type = mat_.type(); + const int kind = arr.kind(); - if (kind == _InputArray::GPU_MAT) + switch (kind) { - #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; - #else - GpuMat d_mat = mat_.getGpuMat(); - impl_ = new Impl(d_mat.rows, d_mat.cols, d_mat.type(), _usage); - impl_->copyFrom(d_mat); - #endif + case _InputArray::OPENGL_BUFFER: + { + copyFrom(arr, target, autoRelease); + break; + } + + case _InputArray::OPENGL_TEXTURE2D: + { + copyFrom(arr, target, autoRelease); + break; + } + + case _InputArray::GPU_MAT: + { + copyFrom(arr, target, autoRelease); + break; + } + + default: + { + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + const GLsizeiptr asize = mat.rows * mat.cols * mat.elemSize(); + impl_ = new Impl(asize, mat.data, target, autoRelease); + rows_ = mat.rows; + cols_ = mat.cols; + type_ = mat.type(); + break; + } } - else - { - Mat mat = mat_.getMat(); - impl_ = new Impl(mat, _usage); - } - - rows_ = _size.height; - cols_ = _size.width; - type_ = _type; #endif } -void cv::GlBuffer::create(int _rows, int _cols, int _type, Usage _usage) +void cv::GlBuffer::create(int arows, int acols, int atype, Target target, bool autoRelease) { #ifndef HAVE_OPENGL - (void)_rows; - (void)_cols; - (void)_type; - (void)_usage; - throw_nogl; + (void) arows; + (void) acols; + (void) atype; + (void) target; + (void) autoRelease; + throw_nogl(); #else - if (rows_ != _rows || cols_ != _cols || type_ != _type || usage_ != _usage) + if (rows_ != arows || cols_ != acols || type_ != atype) { - impl_ = new Impl(_rows, _cols, _type, _usage); - rows_ = _rows; - cols_ = _cols; - type_ = _type; - usage_ = _usage; + const GLsizeiptr asize = arows * acols * CV_ELEM_SIZE(atype); + impl_ = new Impl(asize, 0, target, autoRelease); + rows_ = arows; + cols_ = acols; + type_ = atype; } #endif } void cv::GlBuffer::release() { -#ifndef HAVE_OPENGL - throw_nogl; -#else +#ifdef HAVE_OPENGL + if (*impl_.refcount == 1) + impl_->setAutoRelease(true); impl_ = Impl::empty(); + rows_ = 0; + cols_ = 0; + type_ = 0; #endif } -void cv::GlBuffer::copyFrom(InputArray mat_) +void cv::GlBuffer::setAutoRelease(bool flag) { #ifndef HAVE_OPENGL - (void)mat_; - throw_nogl; + (void) flag; + throw_nogl(); #else - int kind = mat_.kind(); - Size _size = mat_.size(); - int _type = mat_.type(); + impl_->setAutoRelease(flag); +#endif +} - create(_size, _type); +void cv::GlBuffer::copyFrom(InputArray arr, Target target, bool autoRelease) +{ +#ifndef HAVE_OPENGL + (void) arr; + (void) target; + (void) autoRelease; + throw_nogl(); +#else + const int kind = arr.kind(); + + if (kind == _InputArray::OPENGL_TEXTURE2D) + { + GlTexture2D tex = arr.getGlTexture2D(); + tex.copyTo(*this); + setAutoRelease(autoRelease); + return; + } + + const Size asize = arr.size(); + const int atype = arr.type(); + create(asize, atype, target, autoRelease); switch (kind) { 
case _InputArray::OPENGL_BUFFER: { - GlBuffer buf = mat_.getGlBuffer(); - *this = buf; + GlBuffer buf = arr.getGlBuffer(); + impl_->copyFrom(buf.bufId(), asize.area() * CV_ELEM_SIZE(atype)); break; } + case _InputArray::GPU_MAT: { #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); #else - GpuMat d_mat = mat_.getGpuMat(); - impl_->copyFrom(d_mat); + GpuMat dmat = arr.getGpuMat(); + impl_->copyFrom(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows); #endif break; } + default: { - Mat mat = mat_.getMat(); - impl_->copyFrom(mat, usage_); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + impl_->copyFrom(asize.area() * CV_ELEM_SIZE(atype), mat.data); } } #endif } -void cv::GlBuffer::bind() const +void cv::GlBuffer::copyTo(OutputArray arr, Target target, bool autoRelease) const { #ifndef HAVE_OPENGL - throw_nogl; + (void) arr; + (void) target; + (void) autoRelease; + throw_nogl(); #else - impl_->bind(usage_); + const int kind = arr.kind(); + + switch (kind) + { + case _InputArray::OPENGL_BUFFER: + { + arr.getGlBufferRef().copyFrom(*this, target, autoRelease); + break; + } + + case _InputArray::OPENGL_TEXTURE2D: + { + arr.getGlTexture2DRef().copyFrom(*this, autoRelease); + break; + } + + case _InputArray::GPU_MAT: + { + #if !defined HAVE_CUDA || defined(CUDA_DISABLER) + throw_nocuda(); + #else + GpuMat& dmat = arr.getGpuMatRef(); + dmat.create(rows_, cols_, type_); + impl_->copyTo(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows); + #endif + + break; + } + + default: + { + arr.create(rows_, cols_, type_); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + impl_->copyTo(mat.rows * mat.cols * mat.elemSize(), mat.data); + } + } #endif } -void cv::GlBuffer::unbind() const +GlBuffer cv::GlBuffer::clone(Target target, bool autoRelease) const { #ifndef HAVE_OPENGL - throw_nogl; + (void) target; + (void) autoRelease; + throw_nogl(); + return GlBuffer(); #else - impl_->unbind(usage_); + GlBuffer buf; + buf.copyFrom(*this, target, autoRelease); + return buf; #endif } -Mat cv::GlBuffer::mapHost() +void cv::GlBuffer::bind(Target target) const { #ifndef HAVE_OPENGL - throw_nogl; + (void) target; + throw_nogl(); +#else + impl_->bind(target); +#endif +} + +void cv::GlBuffer::unbind(Target target) +{ +#ifndef HAVE_OPENGL + (void) target; + throw_nogl(); +#else + gl::BindBuffer(target, 0); + CV_CheckGlError(); +#endif +} + +Mat cv::GlBuffer::mapHost(Access access) +{ +#ifndef HAVE_OPENGL + (void) access; + throw_nogl(); return Mat(); #else - return impl_->mapHost(rows_, cols_, type_, usage_); + return Mat(rows_, cols_, type_, impl_->mapHost(access)); #endif } void cv::GlBuffer::unmapHost() { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); #else - impl_->unmapHost(usage_); + return impl_->unmapHost(); #endif } GpuMat cv::GlBuffer::mapDevice() { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); return GpuMat(); #else #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); return GpuMat(); #else - return impl_->mapDevice(rows_, cols_, type_); + return GpuMat(rows_, cols_, type_, impl_->mapDevice()); #endif #endif } @@ -690,418 +793,443 @@ GpuMat cv::GlBuffer::mapDevice() void cv::GlBuffer::unmapDevice() { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); #else #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); #else impl_->unmapDevice(); #endif #endif } +unsigned int cv::GlBuffer::bufId() const +{ +#ifndef HAVE_OPENGL + throw_nogl(); + return 0; +#else + return impl_->bufId(); 
+#endif +} + template <> void cv::Ptr::delete_obj() { if (obj) delete obj; } ////////////////////////////////////////////////////////////////////////////////////////// -// GlTexture +// GlTexture2D #ifndef HAVE_OPENGL -class cv::GlTexture::Impl +class cv::GlTexture2D::Impl { }; #else -class cv::GlTexture::Impl +class cv::GlTexture2D::Impl { public: static const Ptr empty(); - Impl(int rows, int cols, int type); - - Impl(const Mat& mat, bool bgra); - Impl(const GlBuffer& buf, bool bgra); - + Impl(GLuint texId, bool autoRelease); + Impl(GLint internalFormat, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid* pixels, bool autoRelease); ~Impl(); - void copyFrom(const Mat& mat, bool bgra); - void copyFrom(const GlBuffer& buf, bool bgra); + void copyFrom(GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels); + void copyTo(GLenum format, GLenum type, GLvoid* pixels) const; void bind() const; - void unbind() const; + + void setAutoRelease(bool flag) { autoRelease_ = flag; } + + GLuint texId() const { return texId_; } private: Impl(); - GLuint tex_; + GLuint texId_; + bool autoRelease_; }; -inline const Ptr cv::GlTexture::Impl::empty() +const Ptr cv::GlTexture2D::Impl::empty() { static Ptr p(new Impl); return p; } -inline cv::GlTexture::Impl::Impl() : tex_(0) +cv::GlTexture2D::Impl::Impl() : texId_(0), autoRelease_(true) { } -cv::GlTexture::Impl::Impl(int rows, int cols, int type) : tex_(0) +cv::GlTexture2D::Impl::Impl(GLuint atexId, bool autoRelease) : texId_(atexId), autoRelease_(autoRelease) { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; +} - int depth = CV_MAT_DEPTH(type); - int cn = CV_MAT_CN(type); - - CV_DbgAssert(rows > 0 && cols > 0); - CV_Assert(cn == 1 || cn == 3 || cn == 4); - CV_Assert(depth >= 0 && depth <= CV_32F); - - glGenTextures(1, &tex_); - CV_CheckGlError(); - CV_Assert(tex_ != 0); - - glBindTexture(GL_TEXTURE_2D, tex_); +cv::GlTexture2D::Impl::Impl(GLint internalFormat, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid* pixels, bool autoRelease) : texId_(0), autoRelease_(autoRelease) +{ + gl::GenTextures(1, &texId_); CV_CheckGlError(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + CV_Assert(texId_ != 0); + + gl::BindTexture(gl::TEXTURE_2D, texId_); CV_CheckGlError(); - GLenum format = cn == 1 ? GL_LUMINANCE : cn == 3 ? 
GL_BGR : GL_BGRA; - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + gl::PixelStorei(gl::UNPACK_ALIGNMENT, 1); CV_CheckGlError(); - glTexImage2D(GL_TEXTURE_2D, 0, cn, cols, rows, 0, format, gl_types[depth], 0); + gl::TexImage2D(gl::TEXTURE_2D, 0, internalFormat, width, height, 0, format, type, pixels); + CV_CheckGlError(); + + gl::GenerateMipmap(gl::TEXTURE_2D); CV_CheckGlError(); } -cv::GlTexture::Impl::Impl(const Mat& mat, bool bgra) : tex_(0) +cv::GlTexture2D::Impl::~Impl() { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; + if (autoRelease_ && texId_) + gl::DeleteTextures(1, &texId_); +} - int depth = mat.depth(); - int cn = mat.channels(); - - CV_DbgAssert(mat.rows > 0 && mat.cols > 0); - CV_Assert(cn == 1 || cn == 3 || cn == 4); - CV_Assert(depth >= 0 && depth <= CV_32F); - CV_Assert(mat.isContinuous()); - - glGenTextures(1, &tex_); - CV_CheckGlError(); - CV_Assert(tex_ != 0); - - glBindTexture(GL_TEXTURE_2D, tex_); +void cv::GlTexture2D::Impl::copyFrom(GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels) +{ + gl::BindTexture(gl::TEXTURE_2D, texId_); CV_CheckGlError(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + gl::PixelStorei(gl::UNPACK_ALIGNMENT, 1); CV_CheckGlError(); - GLenum format = cn == 1 ? GL_LUMINANCE : (cn == 3 ? (bgra ? GL_BGR : GL_RGB) : (bgra ? GL_BGRA : GL_RGBA)); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + gl::TexSubImage2D(gl::TEXTURE_2D, 0, 0, 0, width, height, format, type, pixels); CV_CheckGlError(); - glTexImage2D(GL_TEXTURE_2D, 0, cn, mat.cols, mat.rows, 0, format, gl_types[depth], mat.data); + gl::GenerateMipmap(gl::TEXTURE_2D); CV_CheckGlError(); } -cv::GlTexture::Impl::Impl(const GlBuffer& buf, bool bgra) : tex_(0) +void cv::GlTexture2D::Impl::copyTo(GLenum format, GLenum type, GLvoid* pixels) const { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; - - int depth = buf.depth(); - int cn = buf.channels(); - - CV_DbgAssert(buf.rows() > 0 && buf.cols() > 0); - CV_Assert(cn == 1 || cn == 3 || cn == 4); - CV_Assert(depth >= 0 && depth <= CV_32F); - CV_Assert(buf.usage() == GlBuffer::TEXTURE_BUFFER); - - glGenTextures(1, &tex_); - CV_CheckGlError(); - CV_Assert(tex_ != 0); - - glBindTexture(GL_TEXTURE_2D, tex_); + gl::BindTexture(gl::TEXTURE_2D, texId_); CV_CheckGlError(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + gl::PixelStorei(gl::PACK_ALIGNMENT, 1); CV_CheckGlError(); - GLenum format = cn == 1 ? GL_LUMINANCE : (cn == 3 ? (bgra ? GL_BGR : GL_RGB) : (bgra ? GL_BGRA : GL_RGBA)); - - buf.bind(); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - CV_CheckGlError(); - - glTexImage2D(GL_TEXTURE_2D, 0, cn, buf.cols(), buf.rows(), 0, format, gl_types[depth], 0); - CV_CheckGlError(); - - buf.unbind(); -} - -inline cv::GlTexture::Impl::~Impl() -{ - if (tex_) - glDeleteTextures(1, &tex_); -} - -void cv::GlTexture::Impl::copyFrom(const Mat& mat, bool bgra) -{ - CV_Assert(tex_ != 0); - - bind(); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - CV_CheckGlError(); - - int cn = mat.channels(); - GLenum format = cn == 1 ? GL_LUMINANCE : (cn == 3 ? (bgra ? GL_BGR : GL_RGB) : (bgra ? 
GL_BGRA : GL_RGBA)); - - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, mat.cols, mat.rows, format, gl_types[mat.depth()], mat.data); - CV_CheckGlError(); - - unbind(); -} - -void cv::GlTexture::Impl::copyFrom(const GlBuffer& buf, bool bgra) -{ - CV_Assert(tex_ != 0); - CV_Assert(buf.usage() == GlBuffer::TEXTURE_BUFFER); - - bind(); - - buf.bind(); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - CV_CheckGlError(); - - int cn = buf.channels(); - GLenum format = cn == 1 ? GL_LUMINANCE : (cn == 3 ? (bgra ? GL_BGR : GL_RGB) : (bgra ? GL_BGRA : GL_RGBA)); - - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, buf.cols(), buf.rows(), format, gl_types[buf.depth()], 0); - CV_CheckGlError(); - - buf.unbind(); - - unbind(); -} - -inline void cv::GlTexture::Impl::bind() const -{ - CV_Assert(tex_ != 0); - - glEnable(GL_TEXTURE_2D); - CV_CheckGlError(); - - glBindTexture(GL_TEXTURE_2D, tex_); + gl::GetTexImage(gl::TEXTURE_2D, 0, format, type, pixels); CV_CheckGlError(); } -inline void cv::GlTexture::Impl::unbind() const +void cv::GlTexture2D::Impl::bind() const { - glBindTexture(GL_TEXTURE_2D, 0); - - glDisable(GL_TEXTURE_2D); + gl::BindTexture(gl::TEXTURE_2D, texId_); + CV_CheckGlError(); } #endif // HAVE_OPENGL -cv::GlTexture::GlTexture() : rows_(0), cols_(0), type_(0), buf_(GlBuffer::TEXTURE_BUFFER) +cv::GlTexture2D::GlTexture2D() : rows_(0), cols_(0), format_(NONE) { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); #else impl_ = Impl::empty(); #endif } -cv::GlTexture::GlTexture(int _rows, int _cols, int _type) : rows_(0), cols_(0), type_(0), buf_(GlBuffer::TEXTURE_BUFFER) +cv::GlTexture2D::GlTexture2D(int arows, int acols, Format aformat, unsigned int atexId, bool autoRelease) : rows_(0), cols_(0), format_(NONE) { #ifndef HAVE_OPENGL - (void)_rows; - (void)_cols; - (void)_type; - throw_nogl; + (void) arows; + (void) acols; + (void) aformat; + (void) atexId; + (void) autoRelease; + throw_nogl(); #else - impl_ = new Impl(_rows, _cols, _type); - rows_ = _rows; - cols_ = _cols; - type_ = _type; + impl_ = new Impl(atexId, autoRelease); + rows_ = arows; + cols_ = acols; + format_ = aformat; #endif } -cv::GlTexture::GlTexture(Size _size, int _type) : rows_(0), cols_(0), type_(0), buf_(GlBuffer::TEXTURE_BUFFER) +cv::GlTexture2D::GlTexture2D(Size asize, Format aformat, unsigned int atexId, bool autoRelease) : rows_(0), cols_(0), format_(NONE) { #ifndef HAVE_OPENGL - (void)_size; - (void)_type; - throw_nogl; + (void) asize; + (void) aformat; + (void) atexId; + (void) autoRelease; + throw_nogl(); #else - impl_ = new Impl(_size.height, _size.width, _type); - rows_ = _size.height; - cols_ = _size.width; - type_ = _type; + impl_ = new Impl(atexId, autoRelease); + rows_ = asize.height; + cols_ = asize.width; + format_ = aformat; #endif } -cv::GlTexture::GlTexture(InputArray mat_, bool bgra) : rows_(0), cols_(0), type_(0), buf_(GlBuffer::TEXTURE_BUFFER) +cv::GlTexture2D::GlTexture2D(int arows, int acols, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE) +{ + create(arows, acols, aformat, autoRelease); +} + +cv::GlTexture2D::GlTexture2D(Size asize, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE) +{ + create(asize, aformat, autoRelease); +} + +cv::GlTexture2D::GlTexture2D(InputArray arr, bool autoRelease) : rows_(0), cols_(0), format_(NONE) { #ifndef HAVE_OPENGL - (void)mat_; - (void)bgra; - throw_nogl; + (void) arr; + (void) autoRelease; + throw_nogl(); #else - int kind = mat_.kind(); - Size _size = mat_.size(); - int _type = mat_.type(); + const int kind = arr.kind(); + + const Size asize = 
arr.size(); + const int atype = arr.type(); + + const int depth = CV_MAT_DEPTH(atype); + const int cn = CV_MAT_CN(atype); + + CV_Assert( depth <= CV_32F ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); + + const Format internalFormats[] = + { + NONE, DEPTH_COMPONENT, NONE, RGB, RGBA + }; + const GLenum srcFormats[] = + { + 0, gl::DEPTH_COMPONENT, 0, gl::BGR, gl::BGRA + }; switch (kind) { case _InputArray::OPENGL_BUFFER: { - GlBuffer buf = mat_.getGlBuffer(); - impl_ = new Impl(buf, bgra); + GlBuffer buf = arr.getGlBuffer(); + buf.bind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_ = new Impl(internalFormats[cn], asize.width, asize.height, srcFormats[cn], gl_types[depth], 0, autoRelease); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); break; } + case _InputArray::GPU_MAT: { #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); #else - GpuMat d_mat = mat_.getGpuMat(); - GlBuffer buf(d_mat, GlBuffer::TEXTURE_BUFFER); - impl_ = new Impl(buf, bgra); + GpuMat dmat = arr.getGpuMat(); + GlBuffer buf(dmat, GlBuffer::PIXEL_UNPACK_BUFFER); + buf.bind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_ = new Impl(internalFormats[cn], asize.width, asize.height, srcFormats[cn], gl_types[depth], 0, autoRelease); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); #endif break; } + default: { - Mat mat = mat_.getMat(); - impl_ = new Impl(mat, bgra); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_ = new Impl(internalFormats[cn], asize.width, asize.height, srcFormats[cn], gl_types[depth], mat.data, autoRelease); break; } } - rows_ = _size.height; - cols_ = _size.width; - type_ = _type; + rows_ = asize.height; + cols_ = asize.width; + format_ = internalFormats[cn]; #endif } -void cv::GlTexture::create(int _rows, int _cols, int _type) +void cv::GlTexture2D::create(int arows, int acols, Format aformat, bool autoRelease) { #ifndef HAVE_OPENGL - (void)_rows; - (void)_cols; - (void)_type; - throw_nogl; + (void) arows; + (void) acols; + (void) aformat; + (void) autoRelease; + throw_nogl(); #else - if (rows_ != _rows || cols_ != _cols || type_ != _type) + if (rows_ != arows || cols_ != acols || format_ != aformat) { - impl_ = new Impl(_rows, _cols, _type); - rows_ = _rows; - cols_ = _cols; - type_ = _type; + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_ = new Impl(aformat, acols, arows, aformat, gl::FLOAT, 0, autoRelease); + rows_ = arows; + cols_ = acols; + format_ = aformat; } #endif } -void cv::GlTexture::release() +void cv::GlTexture2D::release() { -#ifndef HAVE_OPENGL - throw_nogl; -#else +#ifdef HAVE_OPENGL + if (*impl_.refcount == 1) + impl_->setAutoRelease(true); impl_ = Impl::empty(); + rows_ = 0; + cols_ = 0; + format_ = NONE; #endif } -void cv::GlTexture::copyFrom(InputArray mat_, bool bgra) +void cv::GlTexture2D::setAutoRelease(bool flag) { #ifndef HAVE_OPENGL - (void)mat_; - (void)bgra; - throw_nogl; + (void) flag; + throw_nogl(); #else - int kind = mat_.kind(); - Size _size = mat_.size(); - int _type = mat_.type(); + impl_->setAutoRelease(flag); +#endif +} - create(_size, _type); +void cv::GlTexture2D::copyFrom(InputArray arr, bool autoRelease) +{ +#ifndef HAVE_OPENGL + (void) arr; + (void) autoRelease; + throw_nogl(); +#else + const int kind = arr.kind(); + + const Size asize = arr.size(); + const int atype = arr.type(); + + const int depth = CV_MAT_DEPTH(atype); + const int cn = CV_MAT_CN(atype); + + CV_Assert( depth <= CV_32F ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); + + const Format 
internalFormats[] = + { + NONE, DEPTH_COMPONENT, NONE, RGB, RGBA + }; + const GLenum srcFormats[] = + { + 0, gl::DEPTH_COMPONENT, 0, gl::BGR, gl::BGRA + }; + + create(asize, internalFormats[cn], autoRelease); switch(kind) { - case _InputArray::OPENGL_TEXTURE: - { - GlTexture tex = mat_.getGlTexture(); - *this = tex; - break; - } case _InputArray::OPENGL_BUFFER: { - GlBuffer buf = mat_.getGlBuffer(); - impl_->copyFrom(buf, bgra); + GlBuffer buf = arr.getGlBuffer(); + buf.bind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_->copyFrom(asize.width, asize.height, srcFormats[cn], gl_types[depth], 0); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); break; } + case _InputArray::GPU_MAT: { #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); #else - GpuMat d_mat = mat_.getGpuMat(); - buf_.copyFrom(d_mat); - impl_->copyFrom(buf_, bgra); + GpuMat dmat = arr.getGpuMat(); + GlBuffer buf(dmat, GlBuffer::PIXEL_UNPACK_BUFFER); + buf.bind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_->copyFrom(asize.width, asize.height, srcFormats[cn], gl_types[depth], 0); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); #endif break; } + default: { - Mat mat = mat_.getMat(); - impl_->copyFrom(mat, bgra); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_->copyFrom(asize.width, asize.height, srcFormats[cn], gl_types[depth], mat.data); } } #endif } -void cv::GlTexture::bind() const +void cv::GlTexture2D::copyTo(OutputArray arr, int ddepth, bool autoRelease) const { #ifndef HAVE_OPENGL - throw_nogl; + (void) arr; + (void) ddepth; + (void) autoRelease; + throw_nogl(); +#else + const int kind = arr.kind(); + + const int cn = format_ == DEPTH_COMPONENT ? 1: format_ == RGB ? 3 : 4; + const GLenum dstFormat = format_ == DEPTH_COMPONENT ? gl::DEPTH_COMPONENT : format_ == RGB ? 
gl::BGR : gl::BGRA; + + switch(kind) + { + case _InputArray::OPENGL_BUFFER: + { + GlBuffer& buf = arr.getGlBufferRef(); + buf.create(rows_, cols_, CV_MAKE_TYPE(ddepth, cn), GlBuffer::PIXEL_PACK_BUFFER, autoRelease); + buf.bind(GlBuffer::PIXEL_PACK_BUFFER); + impl_->copyTo(dstFormat, gl_types[ddepth], 0); + GlBuffer::unbind(GlBuffer::PIXEL_PACK_BUFFER); + break; + } + + case _InputArray::GPU_MAT: + { + #if !defined HAVE_CUDA || defined(CUDA_DISABLER) + throw_nocuda(); + #else + GlBuffer buf(rows_, cols_, CV_MAKE_TYPE(ddepth, cn), GlBuffer::PIXEL_PACK_BUFFER); + buf.bind(GlBuffer::PIXEL_PACK_BUFFER); + impl_->copyTo(dstFormat, gl_types[ddepth], 0); + GlBuffer::unbind(GlBuffer::PIXEL_PACK_BUFFER); + buf.copyTo(arr); + #endif + + break; + } + + default: + { + arr.create(rows_, cols_, CV_MAKE_TYPE(ddepth, cn)); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + GlBuffer::unbind(GlBuffer::PIXEL_PACK_BUFFER); + impl_->copyTo(dstFormat, gl_types[ddepth], mat.data); + } + } +#endif +} + +void cv::GlTexture2D::bind() const +{ +#ifndef HAVE_OPENGL + throw_nogl(); #else impl_->bind(); #endif } -void cv::GlTexture::unbind() const +unsigned int cv::GlTexture2D::texId() const { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); + return 0; #else - impl_->unbind(); + return impl_->texId(); #endif } -template <> void cv::Ptr::delete_obj() +template <> void cv::Ptr::delete_obj() { if (obj) delete obj; } @@ -1109,266 +1237,253 @@ template <> void cv::Ptr::delete_obj() //////////////////////////////////////////////////////////////////////// // GlArrays -void cv::GlArrays::setVertexArray(InputArray vertex) +cv::GlArrays::GlArrays() : size_(0) { - int cn = vertex.channels(); - int depth = vertex.depth(); - - CV_Assert(cn == 2 || cn == 3 || cn == 4); - CV_Assert(depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F); - - vertex_.copyFrom(vertex); } -void cv::GlArrays::setColorArray(InputArray color, bool bgra) +void cv::GlArrays::setVertexArray(InputArray vertex) { - int cn = color.channels(); + const int cn = vertex.channels(); + const int depth = vertex.depth(); - CV_Assert((cn == 3 && !bgra) || cn == 4); + CV_Assert( cn == 2 || cn == 3 || cn == 4 ); + CV_Assert( depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F ); - color_.copyFrom(color); - bgra_ = bgra; + if (vertex.kind() == _InputArray::OPENGL_BUFFER) + vertex_ = vertex.getGlBuffer(); + else + vertex_.copyFrom(vertex); + + size_ = vertex_.size().area(); +} + +void cv::GlArrays::resetVertexArray() +{ + vertex_.release(); + size_ = 0; +} + +void cv::GlArrays::setColorArray(InputArray color) +{ + const int cn = color.channels(); + + CV_Assert( cn == 3 || cn == 4 ); + + if (color.kind() == _InputArray::OPENGL_BUFFER) + color_ = color.getGlBuffer(); + else + color_.copyFrom(color); +} + +void cv::GlArrays::resetColorArray() +{ + color_.release(); } void cv::GlArrays::setNormalArray(InputArray normal) { - int cn = normal.channels(); - int depth = normal.depth(); + const int cn = normal.channels(); + const int depth = normal.depth(); - CV_Assert(cn == 3); - CV_Assert(depth == CV_8S || depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F); + CV_Assert( cn == 3 ); + CV_Assert( depth == CV_8S || depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F ); - normal_.copyFrom(normal); + if (normal.kind() == _InputArray::OPENGL_BUFFER) + normal_ = normal.getGlBuffer(); + else + normal_.copyFrom(normal); +} + +void cv::GlArrays::resetNormalArray() +{ + normal_.release(); } void 
cv::GlArrays::setTexCoordArray(InputArray texCoord) { - int cn = texCoord.channels(); - int depth = texCoord.depth(); + const int cn = texCoord.channels(); + const int depth = texCoord.depth(); - CV_Assert(cn >= 1 && cn <= 4); - CV_Assert(depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F); + CV_Assert( cn >= 1 && cn <= 4 ); + CV_Assert( depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F ); - texCoord_.copyFrom(texCoord); + if (texCoord.kind() == _InputArray::OPENGL_BUFFER) + texCoord_ = texCoord.getGlBuffer(); + else + texCoord_.copyFrom(texCoord); +} + +void cv::GlArrays::resetTexCoordArray() +{ + texCoord_.release(); +} + +void cv::GlArrays::release() +{ + resetVertexArray(); + resetColorArray(); + resetNormalArray(); + resetTexCoordArray(); +} + +void cv::GlArrays::setAutoRelease(bool flag) +{ + vertex_.setAutoRelease(flag); + color_.setAutoRelease(flag); + normal_.setAutoRelease(flag); + texCoord_.setAutoRelease(flag); } void cv::GlArrays::bind() const { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); #else - CV_DbgAssert(texCoord_.empty() || texCoord_.size().area() == vertex_.size().area()); - CV_DbgAssert(normal_.empty() || normal_.size().area() == vertex_.size().area()); - CV_DbgAssert(color_.empty() || color_.size().area() == vertex_.size().area()); + CV_Assert( texCoord_.empty() || texCoord_.size().area() == size_ ); + CV_Assert( normal_.empty() || normal_.size().area() == size_ ); + CV_Assert( color_.empty() || color_.size().area() == size_ ); - if (!texCoord_.empty()) + if (texCoord_.empty()) { - glEnableClientState(GL_TEXTURE_COORD_ARRAY); + gl::DisableClientState(gl::TEXTURE_COORD_ARRAY); CV_CheckGlError(); - - texCoord_.bind(); - - glTexCoordPointer(texCoord_.channels(), gl_types[texCoord_.depth()], 0, 0); - CV_CheckGlError(); - - texCoord_.unbind(); } - - if (!normal_.empty()) + else { - glEnableClientState(GL_NORMAL_ARRAY); + gl::EnableClientState(gl::TEXTURE_COORD_ARRAY); CV_CheckGlError(); - normal_.bind(); + texCoord_.bind(GlBuffer::ARRAY_BUFFER); - glNormalPointer(gl_types[normal_.depth()], 0, 0); - CV_CheckGlError(); - - normal_.unbind(); - } - - if (!color_.empty()) - { - glEnableClientState(GL_COLOR_ARRAY); - CV_CheckGlError(); - - color_.bind(); - - int cn = color_.channels(); - int format = cn == 3 ? cn : (bgra_ ? 
GL_BGRA : 4); - - glColorPointer(format, gl_types[color_.depth()], 0, 0); - CV_CheckGlError(); - - color_.unbind(); - } - - if (!vertex_.empty()) - { - glEnableClientState(GL_VERTEX_ARRAY); - CV_CheckGlError(); - - vertex_.bind(); - - glVertexPointer(vertex_.channels(), gl_types[vertex_.depth()], 0, 0); - CV_CheckGlError(); - - vertex_.unbind(); - } -#endif -} - -void cv::GlArrays::unbind() const -{ -#ifndef HAVE_OPENGL - throw_nogl; -#else - if (!texCoord_.empty()) - { - glDisableClientState(GL_TEXTURE_COORD_ARRAY); + gl::TexCoordPointer(texCoord_.channels(), gl_types[texCoord_.depth()], 0, 0); CV_CheckGlError(); } - if (!normal_.empty()) + if (normal_.empty()) { - glDisableClientState(GL_NORMAL_ARRAY); + gl::DisableClientState(gl::NORMAL_ARRAY); + CV_CheckGlError(); + } + else + { + gl::EnableClientState(gl::NORMAL_ARRAY); + CV_CheckGlError(); + + normal_.bind(GlBuffer::ARRAY_BUFFER); + + gl::NormalPointer(gl_types[normal_.depth()], 0, 0); CV_CheckGlError(); } - if (!color_.empty()) + if (color_.empty()) { - glDisableClientState(GL_COLOR_ARRAY); + gl::DisableClientState(gl::COLOR_ARRAY); + CV_CheckGlError(); + } + else + { + gl::EnableClientState(gl::COLOR_ARRAY); + CV_CheckGlError(); + + color_.bind(GlBuffer::ARRAY_BUFFER); + + const int cn = color_.channels(); + + gl::ColorPointer(cn, gl_types[color_.depth()], 0, 0); CV_CheckGlError(); } - if (!vertex_.empty()) + if (vertex_.empty()) { - glDisableClientState(GL_VERTEX_ARRAY); + gl::DisableClientState(gl::VERTEX_ARRAY); CV_CheckGlError(); } -#endif -} - -//////////////////////////////////////////////////////////////////////// -// GlFont - -cv::GlFont::GlFont(const string& _family, int _height, Weight _weight, Style _style) - : family_(_family), height_(_height), weight_(_weight), style_(_style), base_(0) -{ -#ifndef HAVE_OPENGL - throw_nogl; -#else - base_ = glGenLists(256); - CV_CheckGlError(); - - glFuncTab()->generateBitmapFont(family_, height_, weight_, (style_ & STYLE_ITALIC) != 0, (style_ & STYLE_UNDERLINE) != 0, 0, 256, base_); -#endif -} - -void cv::GlFont::draw(const char* str, size_t len) const -{ -#ifndef HAVE_OPENGL - (void)str; - (void)len; - throw_nogl; -#else - if (base_ && len > 0) + else { - glPushAttrib(GL_LIST_BIT); - glListBase(base_); + gl::EnableClientState(gl::VERTEX_ARRAY); + CV_CheckGlError(); - glCallLists(static_cast(len), GL_UNSIGNED_BYTE, str); - - glPopAttrib(); + vertex_.bind(GlBuffer::ARRAY_BUFFER); + gl::VertexPointer(vertex_.channels(), gl_types[vertex_.depth()], 0, 0); CV_CheckGlError(); } -#endif -} -namespace -{ - class FontCompare : public unary_function, bool> - { - public: - inline FontCompare(const string& family, int height, GlFont::Weight weight, GlFont::Style style) - : family_(family), height_(height), weight_(weight), style_(style) - { - } - - bool operator ()(const cv::Ptr& font) - { - return font->family() == family_ && font->height() == height_ && font->weight() == weight_ && font->style() == style_; - } - - private: - string family_; - int height_; - GlFont::Weight weight_; - GlFont::Style style_; - }; -} - -Ptr cv::GlFont::get(const std::string& family, int height, Weight weight, Style style) -{ -#ifndef HAVE_OPENGL - (void)family; - (void)height; - (void)weight; - (void)style; - throw_nogl; - return Ptr(); -#else - static vector< Ptr > fonts; - fonts.reserve(10); - - vector< Ptr >::iterator fontIt = find_if(fonts.begin(), fonts.end(), FontCompare(family, height, weight, style)); - - if (fontIt == fonts.end()) - { - fonts.push_back(new GlFont(family, height, weight, style)); - - fontIt 
= fonts.end() - 1; - } - - return *fontIt; + GlBuffer::unbind(GlBuffer::ARRAY_BUFFER); #endif } //////////////////////////////////////////////////////////////////////// // Rendering -void cv::render(const GlTexture& tex, Rect_ wndRect, Rect_ texRect) +void cv::render(const GlTexture2D& tex, Rect_ wndRect, Rect_ texRect) { #ifndef HAVE_OPENGL - (void)tex; - (void)wndRect; - (void)texRect; - throw_nogl; + (void) tex; + (void) wndRect; + (void) texRect; + throw_nogl(); #else if (!tex.empty()) { - tex.bind(); - - glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE); - - glBegin(GL_QUADS); - glTexCoord2d(texRect.x, texRect.y); - glVertex2d(wndRect.x, wndRect.y); - - glTexCoord2d(texRect.x, texRect.y + texRect.height); - glVertex2d(wndRect.x, (wndRect.y + wndRect.height)); - - glTexCoord2d(texRect.x + texRect.width, texRect.y + texRect.height); - glVertex2d(wndRect.x + wndRect.width, (wndRect.y + wndRect.height)); - - glTexCoord2d(texRect.x + texRect.width, texRect.y); - glVertex2d(wndRect.x + wndRect.width, wndRect.y); - glEnd(); - + gl::MatrixMode(gl::PROJECTION); + gl::LoadIdentity(); + gl::Ortho(0.0, 1.0, 1.0, 0.0, -1.0, 1.0); CV_CheckGlError(); - tex.unbind(); + gl::MatrixMode(gl::MODELVIEW); + gl::LoadIdentity(); + CV_CheckGlError(); + + gl::Disable(gl::LIGHTING); + CV_CheckGlError(); + + tex.bind(); + + gl::Enable(gl::TEXTURE_2D); + CV_CheckGlError(); + + gl::TexEnvi(gl::TEXTURE_ENV, gl::TEXTURE_ENV_MODE, gl::REPLACE); + CV_CheckGlError(); + + gl::TexParameteri(gl::TEXTURE_2D, gl::TEXTURE_MIN_FILTER, gl::LINEAR); + CV_CheckGlError(); + + const float vertex[] = + { + wndRect.x, wndRect.y, 0.0f, + wndRect.x, (wndRect.y + wndRect.height), 0.0f, + wndRect.x + wndRect.width, (wndRect.y + wndRect.height), 0.0f, + wndRect.x + wndRect.width, wndRect.y, 0.0f + }; + const float texCoords[] = + { + texRect.x, texRect.y, + texRect.x, texRect.y + texRect.height, + texRect.x + texRect.width, texRect.y + texRect.height, + texRect.x + texRect.width, texRect.y + }; + + GlBuffer::unbind(GlBuffer::ARRAY_BUFFER); + + gl::EnableClientState(gl::TEXTURE_COORD_ARRAY); + CV_CheckGlError(); + + gl::TexCoordPointer(2, gl::FLOAT, 0, texCoords); + CV_CheckGlError(); + + gl::DisableClientState(gl::NORMAL_ARRAY); + gl::DisableClientState(gl::COLOR_ARRAY); + CV_CheckGlError(); + + gl::EnableClientState(gl::VERTEX_ARRAY); + CV_CheckGlError(); + + gl::VertexPointer(3, gl::FLOAT, 0, vertex); + CV_CheckGlError(); + + gl::DrawArrays(cv::RenderMode::QUADS, 0, 4); + CV_CheckGlError(); } #endif } @@ -1376,222 +1491,90 @@ void cv::render(const GlTexture& tex, Rect_ wndRect, Rect_ texRe void cv::render(const GlArrays& arr, int mode, Scalar color) { #ifndef HAVE_OPENGL - (void)arr; - (void)mode; - (void)color; - throw_nogl; + (void) arr; + (void) mode; + (void) color; + throw_nogl(); #else - glColor3d(color[0] / 255.0, color[1] / 255.0, color[2] / 255.0); - - arr.bind(); - - glDrawArrays(mode, 0, arr.size().area()); - - arr.unbind(); -#endif -} - -void cv::render(const string& str, const Ptr& font, Scalar color, Point2d pos) -{ -#ifndef HAVE_OPENGL - (void)str; - (void)font; - (void)color; - (void)pos; - throw_nogl; -#else - glPushAttrib(GL_DEPTH_BUFFER_BIT); - - GLint viewport[4]; - glGetIntegerv(GL_VIEWPORT, viewport); - - glDisable(GL_DEPTH_TEST); - - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - - glColor3d(color[0] / 255.0, color[1] / 255.0, color[2] / 255.0); - - glRasterPos2d(2.0 * (viewport[0] + pos.x) / viewport[2] - 1.0, 1.0 - 2.0 * (viewport[1] + pos.y + 
font->height()) / viewport[3]); - - font->draw(str.c_str(), str.length()); - - glPopAttrib(); -#endif -} - -//////////////////////////////////////////////////////////////////////// -// GlCamera - -cv::GlCamera::GlCamera() : - eye_(0.0, 0.0, -5.0), center_(0.0, 0.0, 0.0), up_(0.0, 1.0, 0.0), - pos_(0.0, 0.0, -5.0), yaw_(0.0), pitch_(0.0), roll_(0.0), - useLookAtParams_(false), - - scale_(1.0, 1.0, 1.0), - - projectionMatrix_(), - fov_(45.0), aspect_(0.0), - left_(0.0), right_(1.0), bottom_(1.0), top_(0.0), - zNear_(-1.0), zFar_(1.0), - perspectiveProjection_(false) -{ -} - -void cv::GlCamera::lookAt(Point3d eye, Point3d center, Point3d up) -{ - eye_ = eye; - center_ = center; - up_ = up; - useLookAtParams_ = true; -} - -void cv::GlCamera::setCameraPos(Point3d pos, double yaw, double pitch, double roll) -{ - pos_ = pos; - yaw_ = yaw; - pitch_ = pitch; - roll_ = roll; - useLookAtParams_ = false; -} - -void cv::GlCamera::setScale(Point3d scale) -{ - scale_ = scale; -} - -void cv::GlCamera::setProjectionMatrix(const Mat& projectionMatrix, bool transpose) -{ - CV_Assert(projectionMatrix.type() == CV_32F || projectionMatrix.type() == CV_64F); - CV_Assert(projectionMatrix.cols == 4 && projectionMatrix.rows == 4); - - projectionMatrix_ = transpose ? projectionMatrix.t() : projectionMatrix; -} - -void cv::GlCamera::setPerspectiveProjection(double fov, double aspect, double zNear, double zFar) -{ - fov_ = fov; - aspect_ = aspect; - zNear_ = zNear; - zFar_ = zFar; - - projectionMatrix_.release(); - perspectiveProjection_ = true; -} - -void cv::GlCamera::setOrthoProjection(double left, double right, double bottom, double top, double zNear, double zFar) -{ - left_ = left; - right_ = right; - bottom_ = bottom; - top_ = top; - zNear_ = zNear; - zFar_ = zFar; - - projectionMatrix_.release(); - perspectiveProjection_ = false; -} - -void cv::GlCamera::setupProjectionMatrix() const -{ -#ifndef HAVE_OPENGL - throw_nogl; -#else - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - - if (projectionMatrix_.empty()) + if (!arr.empty()) { - if (perspectiveProjection_) - gluPerspective(fov_, aspect_, zNear_, zFar_); - else - glOrtho(left_, right_, bottom_, top_, zNear_, zFar_); + gl::Color3d(color[0] / 255.0, color[1] / 255.0, color[2] / 255.0); + + arr.bind(); + + gl::DrawArrays(mode, 0, arr.size()); } - else - { - if (projectionMatrix_.type() == CV_32F) - glLoadMatrixf(projectionMatrix_.ptr()); - else - glLoadMatrixd(projectionMatrix_.ptr()); - } - - CV_CheckGlError(); #endif } -void cv::GlCamera::setupModelViewMatrix() const +void cv::render(const GlArrays& arr, InputArray indices, int mode, Scalar color) { #ifndef HAVE_OPENGL - throw_nogl; + (void) arr; + (void) indices; + (void) mode; + (void) color; + throw_nogl(); #else - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - - if (useLookAtParams_) - gluLookAt(eye_.x, eye_.y, eye_.z, center_.x, center_.y, center_.z, up_.x, up_.y, up_.z); - else + if (!arr.empty() && !indices.empty()) { - glRotated(-yaw_, 0.0, 1.0, 0.0); - glRotated(-pitch_, 1.0, 0.0, 0.0); - glRotated(-roll_, 0.0, 0.0, 1.0); - glTranslated(-pos_.x, -pos_.y, -pos_.z); - } + gl::Color3d(color[0] / 255.0, color[1] / 255.0, color[2] / 255.0); - glScaled(scale_.x, scale_.y, scale_.z); + arr.bind(); - CV_CheckGlError(); -#endif -} + const int kind = indices.kind(); -//////////////////////////////////////////////////////////////////////// -// Error handling - -bool icvCheckGlError(const char* file, const int line, const char* func) -{ -#ifndef HAVE_OPENGL - (void)file; - (void)line; - (void)func; - 
return true; -#else - GLenum err = glGetError(); - - if (err != GL_NO_ERROR) - { - const char* msg; - - switch (err) + switch (kind) { - case GL_INVALID_ENUM: - msg = "An unacceptable value is specified for an enumerated argument"; - break; - case GL_INVALID_VALUE: - msg = "A numeric argument is out of range"; - break; - case GL_INVALID_OPERATION: - msg = "The specified operation is not allowed in the current state"; - break; - case GL_STACK_OVERFLOW: - msg = "This command would cause a stack overflow"; - break; - case GL_STACK_UNDERFLOW: - msg = "This command would cause a stack underflow"; - break; - case GL_OUT_OF_MEMORY: - msg = "There is not enough memory left to execute the command"; - break; + case _InputArray::OPENGL_BUFFER : + { + GlBuffer buf = indices.getGlBuffer(); + + const int depth = buf.depth(); + + CV_Assert( buf.channels() == 1 ); + CV_Assert( depth <= CV_32S ); + + GLenum type; + if (depth < CV_16U) + type = gl::UNSIGNED_BYTE; + else if (depth < CV_32S) + type = gl::UNSIGNED_SHORT; + else + type = gl::UNSIGNED_INT; + + buf.bind(GlBuffer::ELEMENT_ARRAY_BUFFER); + + gl::DrawElements(mode, buf.size().area(), type, 0); + + GlBuffer::unbind(GlBuffer::ELEMENT_ARRAY_BUFFER); + + break; + } + default: - msg = "Unknown error"; - }; + { + Mat mat = indices.getMat(); - cvError(CV_OpenGlApiCallError, func, msg, file, line); + const int depth = mat.depth(); - return false; + CV_Assert( mat.channels() == 1 ); + CV_Assert( depth <= CV_32S ); + CV_Assert( mat.isContinuous() ); + + GLenum type; + if (depth < CV_16U) + type = gl::UNSIGNED_BYTE; + else if (depth < CV_32S) + type = gl::UNSIGNED_SHORT; + else + type = gl::UNSIGNED_INT; + + GlBuffer::unbind(GlBuffer::ELEMENT_ARRAY_BUFFER); + + gl::DrawElements(mode, mat.size().area(), type, mat.data); + } + } } - - return true; #endif } diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 78aafcf928..2f62826dd5 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -22,17 +22,14 @@ source_group("Device" FILES ${lib_device_hdrs}) source_group("Device\\Detail" FILES ${lib_device_hdrs_detail}) if (HAVE_CUDA) - file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp") + file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*") file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu") - file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h") - set(ncv_files ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda}) + set(ncv_files ${ncv_srcs} ${ncv_cuda}) source_group("Src\\NVidia" FILES ${ncv_files}) ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS}) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations /wd4211 /wd4201 /wd4100 /wd4505 /wd4408) string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - - #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep") #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;") if(MSVC) @@ -47,23 +44,18 @@ if (HAVE_CUDA) ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda}) - #CUDA_BUILD_CLEAN_TARGET() - set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) - if(NOT APPLE) - unset(CUDA_nvcuvid_LIBRARY CACHE) - find_cuda_helper_libs(nvcuvid) + if(WITH_NVCUVID) set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY}) endif() if(WIN32) - unset(CUDA_nvcuvenc_LIBRARY CACHE) find_cuda_helper_libs(nvcuvenc) set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY}) endif() - if(NOT APPLE AND WITH_FFMPEG) + if(WITH_FFMPEG) set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES}) endif() else() diff --git 
a/modules/gpu/app/nv_perf_test/CMakeLists.txt b/modules/gpu/app/nv_perf_test/CMakeLists.txt new file mode 100644 index 0000000000..c13f5ef46b --- /dev/null +++ b/modules/gpu/app/nv_perf_test/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 2.8.3) + +project(nv_perf_test) + +find_package(OpenCV REQUIRED) +include_directories(${OpenCV_INCLUDE_DIR}) + +add_executable(${PROJECT_NAME} main.cpp) + +target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS}) diff --git a/modules/gpu/app/nv_perf_test/im1_1280x800.jpg b/modules/gpu/app/nv_perf_test/im1_1280x800.jpg new file mode 100644 index 0000000000..bdbbd4aee9 Binary files /dev/null and b/modules/gpu/app/nv_perf_test/im1_1280x800.jpg differ diff --git a/modules/gpu/app/nv_perf_test/im2_1280x800.jpg b/modules/gpu/app/nv_perf_test/im2_1280x800.jpg new file mode 100644 index 0000000000..ae49640a95 Binary files /dev/null and b/modules/gpu/app/nv_perf_test/im2_1280x800.jpg differ diff --git a/modules/gpu/app/nv_perf_test/main.cpp b/modules/gpu/app/nv_perf_test/main.cpp new file mode 100644 index 0000000000..928b30a19e --- /dev/null +++ b/modules/gpu/app/nv_perf_test/main.cpp @@ -0,0 +1,489 @@ +#include +#define HAVE_CUDA 1 +#include +#include +#include +#include +#include +#include +#include + +static void printOsInfo() +{ +#if defined _WIN32 +# if defined _WIN64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout); +# endif +#elif defined linux +# if defined _LP64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout); +# endif +#elif defined __APPLE__ +# if defined _LP64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout); +# endif +#endif +} + +static void printCudaInfo() +{ + const int deviceCount = cv::gpu::getCudaEnabledDeviceCount(); + + printf("[----------]\n"); fflush(stdout); + printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout); + printf("[----------]\n"); fflush(stdout); + + for (int i = 0; i < deviceCount; ++i) + { + cv::gpu::DeviceInfo info(i); + + printf("[----------]\n"); fflush(stdout); + printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout); + printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout); + printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout); + printf("[ ] \tTotal memory: %d Mb\n", static_cast(static_cast(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout); + printf("[ ] \tFree memory: %d Mb\n", static_cast(static_cast(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout); + if (!info.isCompatible()) + printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n"); + printf("[----------]\n"); fflush(stdout); + } +} + +int main(int argc, char* argv[]) +{ + printOsInfo(); + printCudaInfo(); + + perf::Regression::Init("nv_perf_test"); + perf::TestBase::Init(argc, argv); + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + +#define DEF_PARAM_TEST(name, ...) 
typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name +#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name + +////////////////////////////////////////////////////////// +// HoughLinesP + +DEF_PARAM_TEST_1(Image, std::string); + +PERF_TEST_P(Image, HoughLinesP, + testing::Values(std::string("im1_1280x800.jpg"))) +{ + declare.time(30.0); + + std::string fileName = GetParam(); + + const double rho = 1.0; + const double theta = 1.0; + const int threshold = 40; + const int minLineLenght = 20; + const int maxLineGap = 5; + + cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_image(image); + cv::gpu::GpuMat d_lines; + cv::gpu::HoughLinesBuf d_buf; + + cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + } + } + else + { + cv::Mat mask; + cv::Canny(image, mask, 50, 100); + + std::vector lines; + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// GoodFeaturesToTrack + +DEF_PARAM_TEST(Image_Depth, std::string, perf::MatDepth); + +PERF_TEST_P(Image_Depth, GoodFeaturesToTrack, + testing::Combine( + testing::Values(std::string("im1_1280x800.jpg")), + testing::Values(CV_8U, CV_16U) + )) +{ + declare.time(60); + + const std::string fileName = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + + const int maxCorners = 5000; + const double qualityLevel = 0.05; + const int minDistance = 5; + const int blockSize = 3; + const bool useHarrisDetector = true; + const double k = 0.05; + + cv::Mat src = cv::imread(fileName, cv::IMREAD_GRAYSCALE); + if (src.empty()) + FAIL() << "Unable to load source image [" << fileName << "]"; + + if (depth != CV_8U) + src.convertTo(src, depth); + + cv::Mat mask(src.size(), CV_8UC1, cv::Scalar::all(1)); + mask(cv::Rect(0, 0, 100, 100)).setTo(cv::Scalar::all(0)); + + if (PERF_RUN_GPU()) + { + cv::gpu::GoodFeaturesToTrackDetector_GPU d_detector(maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k); + + cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat d_mask(mask); + cv::gpu::GpuMat d_pts; + + d_detector(d_src, d_pts, d_mask); + + TEST_CYCLE() + { + d_detector(d_src, d_pts, d_mask); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat pts; + + cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k); + + TEST_CYCLE() + { + cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowPyrLKSparse + +typedef std::pair string_pair; + +DEF_PARAM_TEST(ImagePair_Depth_GraySource, string_pair, perf::MatDepth, bool); + +PERF_TEST_P(ImagePair_Depth_GraySource, OpticalFlowPyrLKSparse, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(CV_8U, CV_16U), + testing::Bool() + )) +{ + declare.time(60); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + const bool graySource = std::tr1::get<2>(GetParam()); 
+ + // PyrLK params + const cv::Size winSize(15, 15); + const int maxLevel = 5; + const cv::TermCriteria criteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01); + + // GoodFeaturesToTrack params + const int maxCorners = 5000; + const double qualityLevel = 0.05; + const int minDistance = 5; + const int blockSize = 3; + const bool useHarrisDetector = true; + const double k = 0.05; + + cv::Mat src1 = cv::imread(fileNames.first, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + cv::Mat gray_src; + if (graySource) + gray_src = src1; + else + cv::cvtColor(src1, gray_src, cv::COLOR_BGR2GRAY); + + cv::Mat pts; + cv::goodFeaturesToTrack(gray_src, pts, maxCorners, qualityLevel, minDistance, cv::noArray(), blockSize, useHarrisDetector, k); + + if (depth != CV_8U) + { + src1.convertTo(src1, depth); + src2.convertTo(src2, depth); + } + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_pts(pts.reshape(2, 1)); + cv::gpu::GpuMat d_nextPts; + cv::gpu::GpuMat d_status; + + cv::gpu::PyrLKOpticalFlow d_pyrLK; + d_pyrLK.winSize = winSize; + d_pyrLK.maxLevel = maxLevel; + d_pyrLK.iters = criteria.maxCount; + d_pyrLK.useInitialFlow = false; + + d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status); + + TEST_CYCLE() + { + d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat nextPts; + cv::Mat status; + + cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria); + + TEST_CYCLE() + { + cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowFarneback + +DEF_PARAM_TEST(ImagePair_Depth, string_pair, perf::MatDepth); + +PERF_TEST_P(ImagePair_Depth, OpticalFlowFarneback, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(CV_8U, CV_16U) + )) +{ + declare.time(500); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + + const double pyrScale = 0.5; + const int numLevels = 6; + const int winSize = 7; + const int numIters = 15; + const int polyN = 7; + const double polySigma = 1.5; + const int flags = cv::OPTFLOW_USE_INITIAL_FLOW; + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (depth != CV_8U) + { + src1.convertTo(src1, depth); + src2.convertTo(src2, depth); + } + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_u(src1.size(), CV_32FC1, cv::Scalar::all(0)); + cv::gpu::GpuMat d_v(src1.size(), CV_32FC1, cv::Scalar::all(0)); + + cv::gpu::FarnebackOpticalFlow d_farneback; + d_farneback.pyrScale = pyrScale; + d_farneback.numLevels = numLevels; + d_farneback.winSize = winSize; + d_farneback.numIters = numIters; + 
d_farneback.polyN = polyN; + d_farneback.polySigma = polySigma; + d_farneback.flags = flags; + + d_farneback(d_src1, d_src2, d_u, d_v); + + TEST_CYCLE_N(10) + { + d_farneback(d_src1, d_src2, d_u, d_v); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat flow(src1.size(), CV_32FC2, cv::Scalar::all(0)); + + cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags); + + TEST_CYCLE_N(10) + { + cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowBM + +void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr, + cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious, + cv::Mat& velx, cv::Mat& vely) +{ + cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height); + + velx.create(sz, CV_32FC1); + vely.create(sz, CV_32FC1); + + CvMat cvprev = prev; + CvMat cvcurr = curr; + + CvMat cvvelx = velx; + CvMat cvvely = vely; + + cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely); +} + +DEF_PARAM_TEST(ImagePair_BlockSize_ShiftSize_MaxRange, string_pair, cv::Size, cv::Size, cv::Size); + +PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, OpticalFlowBM, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(cv::Size(16, 16)), + testing::Values(cv::Size(2, 2)), + testing::Values(cv::Size(16, 16)) + )) +{ + declare.time(1000); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const cv::Size block_size = std::tr1::get<1>(GetParam()); + const cv::Size shift_size = std::tr1::get<2>(GetParam()); + const cv::Size max_range = std::tr1::get<3>(GetParam()); + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_velx, d_vely, buf; + + cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + + TEST_CYCLE_N(10) + { + cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + } + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE_N(10) + { + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + } + } + + SANITY_CHECK(0); +} + +PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, FastOpticalFlowBM, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(cv::Size(16, 16)), + testing::Values(cv::Size(1, 1)), + testing::Values(cv::Size(16, 16)) + )) +{ + declare.time(1000); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const cv::Size block_size = std::tr1::get<1>(GetParam()); + const cv::Size shift_size = std::tr1::get<2>(GetParam()); + const cv::Size max_range = std::tr1::get<3>(GetParam()); + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << 
"Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_velx, d_vely; + + cv::gpu::FastOpticalFlowBM fastBM; + + fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); + + TEST_CYCLE_N(10) + { + fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); + } + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE_N(10) + { + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + } + } + + SANITY_CHECK(0); +} diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst index 133660236a..a1118b780a 100644 --- a/modules/gpu/doc/object_detection.rst +++ b/modules/gpu/doc/object_detection.rst @@ -199,6 +199,91 @@ Returns block descriptors computed for the whole image. The function is mainly used to learn the classifier. +Soft Cascade Classifier +========================== + +Soft Cascade Classifier for Object Detection +---------------------------------------------------------- + +Cascade detectors have been shown to operate extremely rapidly, with high accuracy, and have important applications in different spheres. The initial goal for this cascade implementation was the fast and accurate pedestrian detector but it also useful in general. Soft cascade is trained with AdaBoost. But instead of training sequence of stages, the soft cascade is trained as a one long stage of T weak classifiers. Soft cascade is formulated as follows: + +.. math:: + \texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)} + +where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the set of thresholded weak classifiers selected during AdaBoost training scaled by the associated weights. Let + +.. math:: + \texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)} + +be the partial sum of sample responses before :math:`t`-the weak classifier will be applied. The funtcion :math:`\texttt{H}_t(x)` of :math:`t` for sample :math:`x` named *sample trace*. +After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` named *rejection trace*. + +The sample has been rejected if it fall rejection threshold. So stageless cascade allows to reject not-object sample as soon as possible. Another meaning of the sample trace is a confidence with that sample recognized as desired object. At each :math:`t` that confidence depend on all previous weak classifier. This feature of soft cascade is resulted in more accurate detection. The original formulation of soft cascade can be found in [BJ05]_. + +.. [BJ05] Lubomir Bourdev and Jonathan Brandt. tRobust Object Detection Via Soft Cascade. IEEE CVPR, 2005. +.. [BMTG12] Rodrigo Benenson, Markus Mathias, Radu Timofte and Luc Van Gool. Pedestrian detection at 100 frames per second. IEEE CVPR, 2012. + + +gpu::SCascade +----------------------------------------------- +.. ocv:class:: gpu::SCascade : public Algorithm + +Implementation of soft (stageless) cascaded detector. 
:: + + class CV_EXPORTS SCascade : public Algorithm + { + struct CV_EXPORTS Detection + { + ushort x; + ushort y; + ushort w; + ushort h; + float confidence; + int kind; + + enum {PEDESTRIAN = 0}; + }; + + SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1); + virtual ~SCascade(); + virtual bool load(const FileNode& fn); + virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; + virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const; + }; + + +gpu::SCascade::~SCascade +--------------------------- +Destructor for SCascade. + +.. ocv:function:: gpu::SCascade::~SCascade() + + + +gpu::SCascade::load +-------------------------- +Load the cascade from a FileNode. + +.. ocv:function:: bool gpu::SCascade::load(const FileNode& fn) + + :param fn: File node from which the soft cascade is read. + + + +gpu::SCascade::detect +-------------------------- +Apply the cascade to an input frame and return the vector of Detection objects. + +.. ocv:function:: void gpu::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const + + :param image: a frame on which the detector will be applied. + + :param rois: a regions-of-interest mask generated by genRoi. Only the objects that fall into one of the regions will be returned. + + :param objects: an output array of detections represented as a GpuMat of SCascade::Detection elements. The first element of the matrix is the count of detections. + + :param stream: a high-level CUDA stream abstraction used for asynchronous execution. + gpu::CascadeClassifier_GPU -------------------------- diff --git a/modules/gpu/src/opencv2/gpu/device/block.hpp b/modules/gpu/include/opencv2/gpu/device/block.hpp similarity index 100% rename from modules/gpu/src/opencv2/gpu/device/block.hpp rename to modules/gpu/include/opencv2/gpu/device/block.hpp diff --git a/modules/gpu/include/opencv2/gpu/device/common.hpp b/modules/gpu/include/opencv2/gpu/device/common.hpp index 141467fdc8..931e4247e9 100644 --- a/modules/gpu/include/opencv2/gpu/device/common.hpp +++ b/modules/gpu/include/opencv2/gpu/device/common.hpp @@ -85,8 +85,6 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int cv::gpu::error(cudaGetErrorString(err), file, line, func); } -#ifdef __CUDACC__ - namespace cv { namespace gpu { __host__ __device__ __forceinline__ int divUp(int total, int grain) @@ -96,19 +94,25 @@ namespace cv { namespace gpu namespace device { + using cv::gpu::divUp; + +#ifdef __CUDACC__ typedef unsigned char uchar; typedef unsigned short ushort; typedef signed char schar; - typedef unsigned int uint; + #ifdef WIN32 + typedef unsigned int uint; + #endif template inline void bindTexture(const textureReference* tex, const PtrStepSz& img) { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) ); } +#endif // __CUDACC__ } }} -#endif // __CUDACC__ + #endif // __OPENCV_GPU_COMMON_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp b/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp index 981e62335c..fb3bfeb9ee 100644 --- a/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp +++ b/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp @@ -807,9 +807,9 @@ namespace cv { namespace gpu { namespace device template static __device__ 
__forceinline__ void RGB2XYZConvert(const T* src, D& dst) { + dst.z = saturate_cast(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift)); dst.x = saturate_cast(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift)); dst.y = saturate_cast(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift)); - dst.z = saturate_cast(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift)); } template static __device__ __forceinline__ uint RGB2XYZConvert(uint src) diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp new file mode 100644 index 0000000000..091a160e31 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp @@ -0,0 +1,361 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(val); + + For::loadToSmem(smem, val, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(val) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, val, tid); + } + + template + static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op) + { + typename GetType::type>::type reg = thrust::get(smem)[tid + delta]; + thrust::get(smem)[tid] = thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::merge(smem, val, tid, delta, op); + } + template + static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op) + { + typename GetType::type>::type reg = shfl_down(thrust::get(val), delta, width); + thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::mergeShfl(val, delta, width, op); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int) + { + } + + template + static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + template + static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + }; + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid) + { + smem[tid] = val; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid) + { + val = smem[tid]; + } + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + template + __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + unsigned int delta, + const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::merge(smem, val, tid, delta, op); + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& val, + unsigned int delta, + unsigned int width, + const thrust::tuple& op) + { + For<0, thrust::tuple_size 
>::value>::mergeShfl(val, delta, width, op); + } + + template struct Generic + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + loadToSmem(smem, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(smem, val, tid, 1024, op); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(smem, val, tid, 512, op); + + __syncthreads(); + } + if (N >= 512) + { + if (tid < 256) + merge(smem, val, tid, 256, op); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(smem, val, tid, 128, op); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(smem, val, tid, 64, op); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(smem, val, tid, 32, op); + } + + if (tid < 16) + { + merge(smem, val, tid, 16, op); + merge(smem, val, tid, 8, op); + merge(smem, val, tid, 4, op); + merge(smem, val, tid, 2, op); + merge(smem, val, tid, 1, op); + } + } + }; + + template + struct Unroll + { + static __device__ void loopShfl(Reference val, Op op, unsigned int N) + { + mergeShfl(val, I, N, op); + Unroll::loopShfl(val, op, N); + } + static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op) + { + merge(smem, val, tid, I, op); + Unroll::loop(smem, val, tid, op); + } + }; + template + struct Unroll<0, Pointer, Reference, Op> + { + static __device__ void loopShfl(Reference, Op, unsigned int) + { + } + static __device__ void loop(Pointer, Reference, unsigned int, Op) + { + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + #if __CUDA_ARCH__ >= 300 + (void) smem; + (void) tid; + + Unroll::loopShfl(val, op, N); + #else + loadToSmem(smem, val, tid); + + if (tid < N / 2) + Unroll::loop(smem, val, tid, op); + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + const unsigned int laneId = Warp::laneId(); + + #if __CUDA_ARCH__ >= 300 + Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #else + loadToSmem(smem, val, tid); + + if (laneId < 16) + Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op); + + __syncthreads(); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #endif + + __syncthreads(); + + loadFromSmem(smem, val, tid); + + if (tid < 32) + { + #if __CUDA_ARCH__ >= 300 + Unroll::loopShfl(val, op, M); + #else + Unroll::loop(smem, val, tid, op); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp new file mode 100644 index 0000000000..a84e0c2fd0 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp @@ -0,0 +1,498 @@ 
+/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_key_val_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(data); + + For::loadToSmem(smem, data, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(data) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, data, tid); + } + + template + static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width) + { + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + + For::copyShfl(val, delta, width); + } + template + static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta) + { + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + + For::copy(svals, val, tid, delta); + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width) + { + typename GetType::type>::type reg = shfl_down(thrust::get(key), delta, width); + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(key) = reg; + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + } + + For::mergeShfl(key, val, cmp, delta, width); + } + template + static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key, + const ValPointerTuple& svals, const ValReferenceTuple& val, + const CmpTuple& cmp, + unsigned int tid, unsigned int delta) + { + typename GetType::type>::type reg = thrust::get(skeys)[tid + delta]; + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(skeys)[tid] = thrust::get(key) = reg; + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + } + + For::merge(skeys, key, svals, val, cmp, tid, delta); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + + template + static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int) + { + } + template + static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int) + { + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int) + { + } + template + static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&, + const ValPointerTuple&, const ValReferenceTuple&, + const CmpTuple&, + unsigned int, unsigned int) + { + } + }; + + ////////////////////////////////////////////////////// + // loadToSmem + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid) + { + smem[tid] = data; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid) + { + data = smem[tid]; + } 
+ template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, data, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); + } + + ////////////////////////////////////////////////////// + // copyVals + + template + __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) + { + val = shfl_down(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta) + { + svals[tid] = val = svals[tid + delta]; + } + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, + unsigned int delta, + int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + + ////////////////////////////////////////////////////// + // merge + + template + __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(K& key, + const thrust::tuple& val, + const Cmp& cmp, + unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& key, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::mergeShfl(key, val, cmp, delta, width); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } + + ////////////////////////////////////////////////////// + // Generic + + template struct Generic + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + loadToSmem(skeys, key, tid); + loadValsToSmem(svals, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(skeys, key, svals, val, cmp, tid, 1024); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(skeys, key, svals, val, cmp, tid, 512); + + __syncthreads(); + } + 
if (N >= 512) + { + if (tid < 256) + merge(skeys, key, svals, val, cmp, tid, 256); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(skeys, key, svals, val, cmp, tid, 128); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(skeys, key, svals, val, cmp, tid, 64); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(skeys, key, svals, val, cmp, tid, 32); + } + + if (tid < 16) + { + merge(skeys, key, svals, val, cmp, tid, 16); + merge(skeys, key, svals, val, cmp, tid, 8); + merge(skeys, key, svals, val, cmp, tid, 4); + merge(skeys, key, svals, val, cmp, tid, 2); + merge(skeys, key, svals, val, cmp, tid, 1); + } + } + }; + + template + struct Unroll + { + static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N) + { + mergeShfl(key, val, cmp, I, N); + Unroll::loopShfl(key, val, cmp, N); + } + static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + merge(skeys, key, svals, val, cmp, tid, I); + Unroll::loop(skeys, key, svals, val, tid, cmp); + } + }; + template + struct Unroll<0, KP, KR, VP, VR, Cmp> + { + static __device__ void loopShfl(KR, VR, Cmp, unsigned int) + { + } + static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp) + { + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + #if 0 // __CUDA_ARCH__ >= 300 + (void) skeys; + (void) svals; + (void) tid; + + Unroll::loopShfl(key, val, cmp, N); + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (tid < N / 2) + Unroll::loop(skeys, key, svals, val, tid, cmp); + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + const unsigned int laneId = Warp::laneId(); + + #if 0 // __CUDA_ARCH__ >= 300 + Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (laneId < 16) + Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp); + + __syncthreads(); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #endif + + __syncthreads(); + + loadFromSmem(skeys, key, tid); + + if (tid < 32) + { + #if 0 // __CUDA_ARCH__ >= 300 + loadFromSmem(svals, val, tid); + + Unroll::loopShfl(key, val, cmp, M); + #else + Unroll::loop(skeys, key, svals, val, tid, cmp); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp deleted file mode 100644 index 0274f204a2..0000000000 --- a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp +++ /dev/null @@ -1,841 +0,0 @@ 
-/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__ -#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__ - -namespace cv { namespace gpu { namespace device -{ - namespace utility_detail - { - /////////////////////////////////////////////////////////////////////////////// - // Reductor - - template struct WarpReductor - { - template static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - if (tid < n) - data[tid] = partial_reduction; - if (n > 32) __syncthreads(); - - if (n > 32) - { - if (tid < n - 32) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - if (tid < 16) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 16) - { - if (tid < n - 16) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 8) - { - if (tid < n - 8) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 4) - { - if (tid < n - 4) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 2) - { - if (tid < n - 2) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - } - }; - template <> struct WarpReductor<64> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - __syncthreads(); - - if (tid < 32) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - template <> struct WarpReductor<32> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 16) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid 
+ 1 ]); - } - } - }; - template <> struct WarpReductor<16> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - template <> struct WarpReductor<8> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - - template struct ReductionDispatcher; - template <> struct ReductionDispatcher - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - WarpReductor::reduce(data, partial_reduction, tid, op); - } - }; - template <> struct ReductionDispatcher - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - if (tid < n) - data[tid] = partial_reduction; - __syncthreads(); - - - if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); } - if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); } - if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); } - - if (tid < 32) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // PredValWarpReductor - - template struct PredValWarpReductor; - template <> struct PredValWarpReductor<64> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 32) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = sdata[tid + 32]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 32]; - } - - reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - template <> struct PredValWarpReductor<32> - { - template - static __device__ void reduce(T& 
myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 16) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - - template <> struct PredValWarpReductor<16> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 8) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - template <> struct PredValWarpReductor<8> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 4) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - - template struct PredValReductionDispatcher; - template <> struct PredValReductionDispatcher - { - template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - PredValWarpReductor::reduce(myData, myVal, sdata, sval, tid, pred); - } - }; - template <> struct PredValReductionDispatcher - { - template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - myData = sdata[tid]; - myVal = sval[tid]; - - if (n >= 512 && tid < 256) - { - T reg = sdata[tid + 256]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 256]; - } - __syncthreads(); - } - if (n >= 256 && tid < 128) - { - T reg = sdata[tid + 128]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 128]; - } - __syncthreads(); - } - if (n >= 128 && tid < 64) - { - T reg = sdata[tid + 64]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 64]; - } - __syncthreads(); - } - - if (tid < 32) - { - if (n >= 64) - { - T reg = sdata[tid + 32]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 32]; - } - } - if (n >= 32) - { - T reg = sdata[tid + 16]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = 
sval[tid + 16]; - } - } - if (n >= 16) - { - T reg = sdata[tid + 8]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - } - if (n >= 8) - { - T reg = sdata[tid + 4]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - } - if (n >= 4) - { - T reg = sdata[tid + 2]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - } - if (n >= 2) - { - T reg = sdata[tid + 1]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // PredVal2WarpReductor - - template struct PredVal2WarpReductor; - template <> struct PredVal2WarpReductor<64> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 32) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = sdata[tid + 32]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 32]; - sval2[tid] = myVal2 = sval2[tid + 32]; - } - - reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - template <> struct PredVal2WarpReductor<32> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 16) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - - template <> struct PredVal2WarpReductor<16> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 8) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = 
reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - template <> struct PredVal2WarpReductor<8> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 4) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - - template struct PredVal2ReductionDispatcher; - template <> struct PredVal2ReductionDispatcher - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - PredVal2WarpReductor::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); - } - }; - template <> struct PredVal2ReductionDispatcher - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - if (n >= 512 && tid < 256) - { - T reg = sdata[tid + 256]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 256]; - sval2[tid] = myVal2 = sval2[tid + 256]; - } - __syncthreads(); - } - if (n >= 256 && tid < 128) - { - T reg = sdata[tid + 128]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 128]; - sval2[tid] = myVal2 = sval2[tid + 128]; - } - __syncthreads(); - } - if (n >= 128 && tid < 64) - { - T reg = sdata[tid + 64]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 64]; - sval2[tid] = myVal2 = sval2[tid + 64]; - } - __syncthreads(); - } - - if (tid < 32) - { - if (n >= 64) - { - T reg = sdata[tid + 32]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 32]; - sval2[tid] = myVal2 = sval2[tid + 32]; - } - } - if (n >= 32) - { - T reg = sdata[tid + 16]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - } - if (n >= 16) - { - T reg = sdata[tid + 8]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - } - if (n >= 8) - { - T reg = sdata[tid + 4]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 
4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - } - if (n >= 4) - { - T reg = sdata[tid + 2]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - } - if (n >= 2) - { - T reg = sdata[tid + 1]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - } - }; - } // namespace utility_detail -}}} // namespace cv { namespace gpu { namespace device - -#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/emulation.hpp b/modules/gpu/include/opencv2/gpu/device/emulation.hpp index 074e911275..b6fba230e7 100644 --- a/modules/gpu/include/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/include/opencv2/gpu/device/emulation.hpp @@ -44,7 +44,6 @@ #define OPENCV_GPU_EMULATION_HPP_ #include "warp_reduce.hpp" -#include namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp index c601cf5273..6064e8e99c 100644 --- a/modules/gpu/include/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp @@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device template <> struct name : binary_function \ { \ __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \ - __device__ __forceinline__ name(const name& other):binary_function(){}\ - __device__ __forceinline__ name():binary_function(){}\ + __device__ __forceinline__ name() {}\ + __device__ __forceinline__ name(const name&) {}\ }; template struct maximum : binary_function { __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { - return lhs < rhs ? rhs : lhs; + return max(lhs, rhs); } - __device__ __forceinline__ maximum(const maximum& other):binary_function(){} - __device__ __forceinline__ maximum():binary_function(){} + __device__ __forceinline__ maximum() {} + __device__ __forceinline__ maximum(const maximum&) {} }; OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max) @@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device { __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { - return lhs < rhs ? 
lhs : rhs; + return min(lhs, rhs); } - __device__ __forceinline__ minimum(const minimum& other):binary_function(){} - __device__ __forceinline__ minimum():binary_function(){} + __device__ __forceinline__ minimum() {} + __device__ __forceinline__ minimum(const minimum&) {} }; OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min) @@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device // Math functions ///bound========================================= + + template struct abs_func : unary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType x) const + { + return abs(x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned char operator ()(unsigned char x) const + { + return x; + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ signed char operator ()(signed char x) const + { + return ::abs((int)x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ char operator ()(char x) const + { + return ::abs((int)x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned short operator ()(unsigned short x) const + { + return x; + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ short operator ()(short x) const + { + return ::abs((int)x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned int operator ()(unsigned int x) const + { + return x; + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ int operator ()(int x) const + { + return ::abs(x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ float operator ()(float x) const + { + return ::fabsf(x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ double operator ()(double x) const + { + return ::fabs(x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \ template struct name ## _func : unary_function \ { \ @@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device { \ return func ## f(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + __device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; \ template <> struct name ## _func : unary_function \ { \ @@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device { \ return func(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + 
__device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \ @@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device } \ }; - OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2) diff --git a/modules/gpu/include/opencv2/gpu/device/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/reduce.hpp new file mode 100644 index 0000000000..2161b06495 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/reduce.hpp @@ -0,0 +1,197 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_HPP__ +#define __OPENCV_GPU_REDUCE_HPP__ + +#include +#include "detail/reduce.hpp" +#include "detail/reduce_key_val.hpp" + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op) + { + reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); + } + template + __device__ __forceinline__ void reduce(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& op) + { + reduce_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&>(smem, val, tid, op); + } + + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, + const thrust::tuple&, + const Cmp&>(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple& + >(skeys, key, svals, val, tid, cmp); + } + + // smem_tuple + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0) + { + return thrust::make_tuple((volatile T0*) t0); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile 
T5*) t5, (volatile T6*) t6, (volatile T7*) t7); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9); + } +}}} + +#endif // __OPENCV_GPU_UTILITY_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp index 7bb1da751f..7a2799fa37 100644 --- a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp @@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ uchar saturate_cast(schar v) { - return (uchar) ::max((int)v, 0); - } - template<> __device__ __forceinline__ uchar saturate_cast(ushort v) - { - return (uchar) ::min((uint)v, (uint)UCHAR_MAX); - } - template<> __device__ __forceinline__ uchar saturate_cast(int v) - { - return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); - } - template<> __device__ __forceinline__ uchar saturate_cast(uint v) - { - return (uchar) ::min(v, (uint)UCHAR_MAX); + uint res = 0; + int vi = v; + asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast(short v) { - return saturate_cast((uint)v); + uint res = 0; + asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ uchar saturate_cast(ushort v) + { + uint res = 0; + asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ uchar saturate_cast(int v) + { + uint res = 0; + asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } + template<> __device__ __forceinline__ uchar saturate_cast(uint v) + { + uint res = 0; + asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } - template<> __device__ __forceinline__ uchar saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + uint res = 0; + asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + uint res = 0; + asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ schar saturate_cast(uchar v) { - return (schar) ::min((int)v, SCHAR_MAX); - } - template<> __device__ __forceinline__ schar saturate_cast(ushort v) - { - return (schar) ::min((uint)v, (uint)SCHAR_MAX); - } - template<> __device__ __forceinline__ schar saturate_cast(int v) - { - return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? 
SCHAR_MAX : SCHAR_MIN); + uint res = 0; + uint vi = v; + asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; } template<> __device__ __forceinline__ schar saturate_cast(short v) { - return saturate_cast((int)v); + uint res = 0; + asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ schar saturate_cast(ushort v) + { + uint res = 0; + asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ schar saturate_cast(int v) + { + uint res = 0; + asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ schar saturate_cast(uint v) { - return (schar) ::min(v, (uint)SCHAR_MAX); + uint res = 0; + asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + uint res = 0; + asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ schar saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + uint res = 0; + asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ ushort saturate_cast(schar v) { - return (ushort) ::max((int)v, 0); + ushort res = 0; + int vi = v; + asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(short v) { - return (ushort) ::max((int)v, 0); + ushort res = 0; + asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(int v) { - return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); + ushort res = 0; + asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(uint v) { - return (ushort) ::min(v, (uint)USHRT_MAX); + ushort res = 0; + asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + ushort res = 0; + asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + ushort res = 0; + asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ short saturate_cast(ushort v) { - return (short) ::min((int)v, SHRT_MAX); + short res = 0; + asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(int v) { - return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? 
SHRT_MAX : SHRT_MIN); + short res = 0; + asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(uint v) { - return (short) ::min(v, (uint)SHRT_MAX); + short res = 0; + asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + short res = 0; + asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + short res = 0; + asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif } + template<> __device__ __forceinline__ int saturate_cast(uint v) + { + int res = 0; + asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } template<> __device__ __forceinline__ int saturate_cast(float v) { return __float2int_rn(v); @@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device #endif } + template<> __device__ __forceinline__ uint saturate_cast(schar v) + { + uint res = 0; + int vi = v; + asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; + } + template<> __device__ __forceinline__ uint saturate_cast(short v) + { + uint res = 0; + asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ uint saturate_cast(int v) + { + uint res = 0; + asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } template<> __device__ __forceinline__ uint saturate_cast(float v) { return __float2uint_rn(v); diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp index 88a73a10ea..83eaaa21ce 100644 --- a/modules/gpu/include/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp @@ -45,7 +45,6 @@ #include "saturate_cast.hpp" #include "datamov_utils.hpp" -#include "detail/reduction_detail.hpp" namespace cv { namespace gpu { namespace device { @@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device } }; - /////////////////////////////////////////////////////////////////////////////// - // Reduction - - template __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::ReductionDispatcher::reduce(data, partial_reduction, tid, op); - } - - template - __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::PredValReductionDispatcher::reduce(myData, myVal, sdata, sval, tid, pred); - } - - template - __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::PredVal2ReductionDispatcher::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); - } - /////////////////////////////////////////////////////////////////////////////// // Solve linear system diff --git a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp index b7861bca75..d5b4bb202c 100644 --- 
a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp @@ -43,7 +43,7 @@ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__ -#include "utility.hpp" +#include "reduce.hpp" #include "functional.hpp" #include "detail/vec_distance_detail.hpp" @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const @@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const diff --git a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp index 0ec790c0b7..1c46dc0c33 100644 --- a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp @@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \ - OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \ + OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \ @@ -327,4 +327,4 @@ namespace cv { namespace gpu { namespace device #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP }}} // namespace cv { namespace gpu { namespace device -#endif // __OPENCV_GPU_VECMATH_HPP__ \ No newline at end of file +#endif // __OPENCV_GPU_VECMATH_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp new file mode 100644 index 0000000000..8b4479a79b --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp @@ -0,0 +1,145 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
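// Editorial sketch (not from the patch): the new device/reduce.hpp interface,
// as exercised by the vec_distance changes above. Assumptions: a 256-thread
// block, reduce<N>'s template argument is the number of participating threads,
// and plus<> is the functor from functional.hpp.
__global__ void blockSum(const int* src, int* dst, int n)
{
    __shared__ int smem[256];
    int val = 0;
    for (int i = threadIdx.x; i < n; i += 256)
        val += src[i];

    cv::gpu::device::reduce<256>(smem, val, threadIdx.x, cv::gpu::device::plus<int>());

    if (threadIdx.x == 0)
        *dst = val;                        // thread 0 holds the block-wide sum
}
// For several values at once, smem_tuple() above packs the per-value shared
// buffers so a single reduce<N>() call can combine them, e.g.
//   reduce<256>(smem_tuple(ssum, smax), thrust::tie(mySum, myMax), tid,
//               thrust::make_tuple(plus<int>(), maximum<int>()));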
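// Editorial note on the saturate_cast changes above (not part of the patch):
// each specialization now maps to a single PTX "cvt" with the .sat modifier,
// and the float/double paths use .rni (round to nearest even), matching the
// old __float2int_rn() behaviour. A host-side reference of cvt.rni.sat.u8.f32:
static inline unsigned char saturate_u8_ref(float v)
{
    int i = (int)lrintf(v);                                  // <math.h>: round to nearest, ties to even
    return (unsigned char)(i < 0 ? 0 : (i > 255 ? 255 : i)); // the .sat clamp to [0, 255]
}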
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__ +#define __OPENCV_GPU_WARP_SHUFFLE_HPP__ + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl(val, srcLane, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl((int) val, srcLane, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl(lo, srcLane, width); + hi = __shfl(hi, srcLane, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_down(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl_down((int) val, delta, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_down(lo, delta, width); + hi = __shfl_down(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_up(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl_up((int) 
val, delta, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_up(lo, delta, width); + hi = __shfl_up(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } +}}} + +#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index ddb131788f..c6ce2faff3 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -792,31 +792,23 @@ private: GpuMat lab, l, ab; }; +struct CV_EXPORTS CannyBuf +{ + void create(const Size& image_size, int apperture_size = 3); + void release(); -struct CV_EXPORTS CannyBuf; + GpuMat dx, dy; + GpuMat mag; + GpuMat map; + GpuMat st1, st2; + Ptr filterDX, filterDY; +}; CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); -struct CV_EXPORTS CannyBuf -{ - CannyBuf() {} - explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);} - CannyBuf(const GpuMat& dx_, const GpuMat& dy_); - - void create(const Size& image_size, int apperture_size = 3); - - void release(); - - GpuMat dx, dy; - GpuMat dx_buf, dy_buf; - GpuMat edgeBuf; - GpuMat trackBuf1, trackBuf2; - Ptr filterDX, filterDY; -}; - class CV_EXPORTS ImagePyramid { public: @@ -855,6 +847,11 @@ CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float th CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray()); +//! HoughLinesP + +//! finds line segments in the black-n-white image using probabalistic Hough transform +CV_EXPORTS void HoughLinesP(const GpuMat& image, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines = 4096); + //! HoughCircles struct HoughCirclesBuf @@ -1036,11 +1033,9 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels //! Calculates histogram for 8u one channel image //! Output hist will have one row, 256 cols and CV32SC1 type. CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); -CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //! 
normalizes the grayscale image brightness and contrast by normalizing its histogram CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); -CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null()); CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //////////////////////////////// StereoBM_GPU //////////////////////////////// @@ -1532,6 +1527,97 @@ public: int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4); }; +// ======================== GPU version for soft cascade ===================== // + +class CV_EXPORTS ChannelsProcessor +{ +public: + enum + { + GENERIC = 1 << 4, + SEPARABLE = 2 << 4 + }; + + // Appends specified number of HOG first-order features integrals into given vector. + // Param frame is an input 3-channel bgr image. + // Param channels is a GPU matrix of optionally shrinked channels + // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution. + virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0; + + // Creates a specific preprocessor implementation. + // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum + // Param bins is a number of HOG-like channels. + // Param flags is a channel computing extra flags. + static cv::Ptr create(const int shrinkage, const int bins, const int flags = GENERIC); + + virtual ~ChannelsProcessor(); + +protected: + ChannelsProcessor(); +}; + +// Implementation of soft (stageless) cascaded detector. +class CV_EXPORTS SCascade : public Algorithm +{ +public: + + // Representation of detectors result. + struct CV_EXPORTS Detection + { + ushort x; + ushort y; + ushort w; + ushort h; + float confidence; + int kind; + + enum {PEDESTRIAN = 0}; + }; + + enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF}; + + // An empty cascade will be created. + // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed. + // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed. + // Param scales is a number of scales from minScale to maxScale. + // Param flags is an extra tuning flags. + SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, + const int flags = NO_REJECT || ChannelsProcessor::GENERIC); + + virtual ~SCascade(); + + cv::AlgorithmInfo* info() const; + + // Load cascade from FileNode. + // Param fn is a root node for cascade. Should be . + virtual bool load(const FileNode& fn); + + // Load cascade config. + virtual void read(const FileNode& fn); + + // Return the matrix of of detectioned objects. + // Param image is a frame on which detector will be applied. + // Param rois is a regions of interests mask generated by genRoi. + // Only the objects that fall into one of the regions will be returned. + // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection) + // The first element of the matrix is actually a count of detections. 
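// Editorial usage sketch for the SCascade interface documented above (not part
// of the patch; the file name, ROI preparation and buffer size are illustrative):
//   cv::gpu::SCascade cascade;
//   cv::FileStorage fs("soft_cascade.xml", cv::FileStorage::READ);
//   if (!cascade.load(fs.getFirstTopLevelNode())) return;
//
//   cv::gpu::GpuMat frame(bgr);                                        // 3-channel BGR input
//   cv::gpu::GpuMat rois(frame.size(), CV_8UC1, cv::Scalar::all(1));   // "search everywhere" mask
//   cv::gpu::GpuMat objects(1, 4096 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1);
//
//   cascade.detect(frame, rois, objects);
// The first element of `objects` is the detection count; SCascade::Detection
// records follow it.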
+ // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution + virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; + +private: + + struct Fields; + Fields* fields; + + double minScale; + double maxScale; + int scales; + + int flags; +}; + +CV_EXPORTS bool initModule_gpu(void); + ////////////////////////////////// SURF ////////////////////////////////////////// class CV_EXPORTS SURF_GPU @@ -1877,8 +1963,6 @@ private: GpuMat uPyr_[2]; GpuMat vPyr_[2]; - - bool isDeviceArch11_; }; @@ -1895,7 +1979,6 @@ public: polyN = 5; polySigma = 1.1; flags = 0; - isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12); } int numLevels; @@ -1943,8 +2026,113 @@ private: GpuMat frames_[2]; GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2]; std::vector pyramid0_, pyramid1_; +}; - bool isDeviceArch11_; + +// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method +// +// see reference: +// [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". +// [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". +class CV_EXPORTS OpticalFlowDual_TVL1_GPU +{ +public: + OpticalFlowDual_TVL1_GPU(); + + void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy); + + void collectGarbage(); + + /** + * Time step of the numerical scheme. + */ + double tau; + + /** + * Weight parameter for the data term, attachment parameter. + * This is the most relevant parameter, which determines the smoothness of the output. + * The smaller this parameter is, the smoother the solutions we obtain. + * It depends on the range of motions of the images, so its value should be adapted to each image sequence. + */ + double lambda; + + /** + * Weight parameter for (u - v)^2, tightness parameter. + * It serves as a link between the attachment and the regularization terms. + * In theory, it should have a small value in order to maintain both parts in correspondence. + * The method is stable for a large range of values of this parameter. + */ + double theta; + + /** + * Number of scales used to create the pyramid of images. + */ + int nscales; + + /** + * Number of warpings per scale. + * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale. + * This is a parameter that assures the stability of the method. + * It also affects the running time, so it is a compromise between speed and accuracy. + */ + int warps; + + /** + * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time. + * A small value will yield more accurate solutions at the expense of a slower convergence. + */ + double epsilon; + + /** + * Stopping criterion iterations number used in the numerical scheme. + */ + int iterations; + + bool useInitialFlow; + +private: + void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2); + + std::vector I0s; + std::vector I1s; + std::vector u1s; + std::vector u2s; + + GpuMat I1x_buf; + GpuMat I1y_buf; + + GpuMat I1w_buf; + GpuMat I1wx_buf; + GpuMat I1wy_buf; + + GpuMat grad_buf; + GpuMat rho_c_buf; + + GpuMat p11_buf; + GpuMat p12_buf; + GpuMat p21_buf; + GpuMat p22_buf; + + GpuMat diff_buf; + GpuMat norm_buf; +}; + + +//! 
Calculates optical flow for 2 images using block matching algorithm */ +CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, + Size block_size, Size shift_size, Size max_range, bool use_previous, + GpuMat& velx, GpuMat& vely, GpuMat& buf, + Stream& stream = Stream::Null()); + +class CV_EXPORTS FastOpticalFlowBM +{ +public: + void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null()); + +private: + GpuMat buffer; + GpuMat extended_I0; + GpuMat extended_I1; }; diff --git a/modules/gpu/misc/carma.toolchain.cmake b/modules/gpu/misc/carma.toolchain.cmake new file mode 100644 index 0000000000..18f0e0f934 --- /dev/null +++ b/modules/gpu/misc/carma.toolchain.cmake @@ -0,0 +1,26 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_VERSION 1) +set(CMAKE_SYSTEM_PROCESSOR arm) + +set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc-4.5) +set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++-4.5) + +#suppress compiller varning +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-psabi" ) + +# can be any other plases +set(__arm_linux_eabi_root /usr/arm-linux-gnueabi) + +set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${__arm_linux_eabi_root}) + +if(EXISTS ${CUDA_TOOLKIT_ROOT_DIR}) + set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${CUDA_TOOLKIT_ROOT_DIR}) +endif() + +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) + +set(CARMA 1) +add_definitions(-DCARMA) diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp index 725bb9b3d3..ad722fa3b5 100644 --- a/modules/gpu/perf/perf_core.cpp +++ b/modules/gpu/perf/perf_core.cpp @@ -28,27 +28,17 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Merge, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_D cv::gpu::GpuMat d_dst; - cv::gpu::merge(d_src, d_dst); + TEST_CYCLE() cv::gpu::merge(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::merge(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-12); } else { cv::Mat dst; - cv::merge(src, dst); + TEST_CYCLE() cv::merge(src, dst); - TEST_CYCLE() - { - cv::merge(src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-12); } } @@ -69,28 +59,18 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Split, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_D std::vector d_dst; - cv::gpu::split(d_src, d_dst); - - TEST_CYCLE() - { - cv::gpu::split(d_src, d_dst); - } + TEST_CYCLE() cv::gpu::split(d_src, d_dst); cv::gpu::GpuMat first = d_dst[0]; - GPU_SANITY_CHECK(first); + GPU_SANITY_CHECK(first, 1e-12); } else { std::vector dst; - cv::split(src, dst); + TEST_CYCLE() cv::split(src, dst); - TEST_CYCLE() - { - cv::split(src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-12); } } @@ -114,27 +94,17 @@ PERF_TEST_P(Sz_Depth, Core_AddMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_DEP cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::add(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::add(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::add(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::add(src1, src2, dst); + TEST_CYCLE() cv::add(src1, src2, dst); - TEST_CYCLE() - { - cv::add(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -156,27 +126,17 @@ PERF_TEST_P(Sz_Depth, Core_AddScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_ cv::gpu::GpuMat d_src(src); 
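// Editorial usage sketch for the OpticalFlowDual_TVL1_GPU class declared in the
// gpu.hpp hunk above (not part of the patch); d_frame0/d_frame1 are assumed to
// be single-channel GpuMats already uploaded to the device:
//   cv::gpu::OpticalFlowDual_TVL1_GPU tvl1;
//   tvl1.lambda = 0.15;                          // data-term weight, see the parameter notes above
//   cv::gpu::GpuMat d_flowx, d_flowy;
//   tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
//   tvl1.collectGarbage();                       // release the internal pyramid buffers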
cv::gpu::GpuMat d_dst; - cv::gpu::add(d_src, s, d_dst); + TEST_CYCLE() cv::gpu::add(d_src, s, d_dst); - TEST_CYCLE() - { - cv::gpu::add(d_src, s, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::add(src, s, dst); + TEST_CYCLE() cv::add(src, s, dst); - TEST_CYCLE() - { - cv::add(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -200,27 +160,17 @@ PERF_TEST_P(Sz_Depth, Core_SubtractMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MA cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::subtract(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::subtract(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::subtract(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::subtract(src1, src2, dst); + TEST_CYCLE() cv::subtract(src1, src2, dst); - TEST_CYCLE() - { - cv::subtract(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -242,27 +192,17 @@ PERF_TEST_P(Sz_Depth, Core_SubtractScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::subtract(d_src, s, d_dst); + TEST_CYCLE() cv::gpu::subtract(d_src, s, d_dst); - TEST_CYCLE() - { - cv::gpu::subtract(d_src, s, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::subtract(src, s, dst); + TEST_CYCLE() cv::subtract(src, s, dst); - TEST_CYCLE() - { - cv::subtract(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -286,27 +226,17 @@ PERF_TEST_P(Sz_Depth, Core_MultiplyMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MA cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::multiply(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::multiply(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::multiply(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::multiply(src1, src2, dst); + TEST_CYCLE() cv::multiply(src1, src2, dst); - TEST_CYCLE() - { - cv::multiply(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -330,25 +260,17 @@ PERF_TEST_P(Sz_Depth, Core_MultiplyScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM cv::gpu::multiply(d_src, s, d_dst); - TEST_CYCLE() - { - cv::gpu::multiply(d_src, s, d_dst); - } + TEST_CYCLE() cv::gpu::multiply(d_src, s, d_dst); - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::multiply(src, s, dst); + TEST_CYCLE() cv::multiply(src, s, dst); - TEST_CYCLE() - { - cv::multiply(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -372,27 +294,17 @@ PERF_TEST_P(Sz_Depth, Core_DivideMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_ cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::divide(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::divide(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::divide(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::divide(src1, src2, dst); + TEST_CYCLE() cv::divide(src1, src2, dst); - TEST_CYCLE() - { - cv::divide(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -414,27 +326,17 @@ PERF_TEST_P(Sz_Depth, Core_DivideScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_M cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::divide(d_src, s, d_dst); + TEST_CYCLE() cv::gpu::divide(d_src, s, d_dst); - 
TEST_CYCLE() - { - cv::gpu::divide(d_src, s, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::divide(src, s, dst); + TEST_CYCLE() cv::divide(src, s, dst); - TEST_CYCLE() - { - cv::divide(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -456,27 +358,17 @@ PERF_TEST_P(Sz_Depth, Core_DivideScalarInv, Combine(GPU_TYPICAL_MAT_SIZES, ARITH cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::divide(s, d_src, d_dst); + TEST_CYCLE() cv::gpu::divide(s, d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::divide(s, d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::divide(s, src, dst); + TEST_CYCLE() cv::divide(s, src, dst); - TEST_CYCLE() - { - cv::divide(s, src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -500,27 +392,17 @@ PERF_TEST_P(Sz_Depth, Core_AbsDiffMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::absdiff(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::absdiff(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::absdiff(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::absdiff(src1, src2, dst); + TEST_CYCLE() cv::absdiff(src1, src2, dst); - TEST_CYCLE() - { - cv::absdiff(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -542,27 +424,17 @@ PERF_TEST_P(Sz_Depth, Core_AbsDiffScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::absdiff(d_src, s, d_dst); + TEST_CYCLE() cv::gpu::absdiff(d_src, s, d_dst); - TEST_CYCLE() - { - cv::gpu::absdiff(d_src, s, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::absdiff(src, s, dst); + TEST_CYCLE() cv::absdiff(src, s, dst); - TEST_CYCLE() - { - cv::absdiff(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -582,19 +454,11 @@ PERF_TEST_P(Sz_Depth, Core_Abs, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_16S, CV cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::abs(d_src, d_dst); + TEST_CYCLE() cv::gpu::abs(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::abs(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); - } - else - { - FAIL() << "No such CPU implementation analogy"; + GPU_SANITY_CHECK(d_dst, 1e-8); } + else FAIL_NO_CPU(); } ////////////////////////////////////////////////////////////////////// @@ -613,19 +477,11 @@ PERF_TEST_P(Sz_Depth, Core_Sqr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::sqr(d_src, d_dst); + TEST_CYCLE() cv::gpu::sqr(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::sqr(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); - } - else - { - FAIL() << "No such CPU implementation analogy"; + GPU_SANITY_CHECK(d_dst, 1e-8); } + else FAIL_NO_CPU(); } ////////////////////////////////////////////////////////////////////// @@ -644,27 +500,17 @@ PERF_TEST_P(Sz_Depth, Core_Sqrt, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::sqrt(d_src, d_dst); + TEST_CYCLE() cv::gpu::sqrt(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::sqrt(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::sqrt(src, dst); + TEST_CYCLE() cv::sqrt(src, dst); - TEST_CYCLE() - { - cv::sqrt(src, dst); - } - - CPU_SANITY_CHECK(dst); + 
CPU_SANITY_CHECK(dst, 1e-8); } } @@ -684,27 +530,17 @@ PERF_TEST_P(Sz_Depth, Core_Log, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::log(d_src, d_dst); + TEST_CYCLE() cv::gpu::log(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::log(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::log(src, dst); + TEST_CYCLE() cv::log(src, dst); - TEST_CYCLE() - { - cv::log(src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -724,27 +560,17 @@ PERF_TEST_P(Sz_Depth, Core_Exp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::exp(d_src, d_dst); + TEST_CYCLE() cv::gpu::exp(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::exp(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::exp(src, dst); + TEST_CYCLE() TEST_CYCLE() cv::exp(src, dst); - TEST_CYCLE() - { - cv::exp(src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -767,27 +593,17 @@ PERF_TEST_P(Sz_Depth_Power, Core_Pow, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::pow(d_src, power, d_dst); + TEST_CYCLE() cv::gpu::pow(d_src, power, d_dst); - TEST_CYCLE() - { - cv::gpu::pow(d_src, power, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::pow(src, power, dst); + TEST_CYCLE() cv::pow(src, power,dst); - TEST_CYCLE() - { - cv::pow(src, power, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -817,12 +633,7 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITH cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::compare(d_src1, d_src2, d_dst, cmp_code); - - TEST_CYCLE() - { - cv::gpu::compare(d_src1, d_src2, d_dst, cmp_code); - } + TEST_CYCLE() cv::gpu::compare(d_src1, d_src2, d_dst, cmp_code); GPU_SANITY_CHECK(d_dst); } @@ -830,12 +641,7 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITH { cv::Mat dst; - cv::compare(src1, src2, dst, cmp_code); - - TEST_CYCLE() - { - cv::compare(src1, src2, dst, cmp_code); - } + TEST_CYCLE() cv::compare(src1, src2, dst, cmp_code); CPU_SANITY_CHECK(dst); } @@ -860,12 +666,7 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareScalar, Combine(GPU_TYPICAL_MAT_SIZES, AR cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::compare(d_src, s, d_dst, cmp_code); - - TEST_CYCLE() - { - cv::gpu::compare(d_src, s, d_dst, cmp_code); - } + TEST_CYCLE() cv::gpu::compare(d_src, s, d_dst, cmp_code); GPU_SANITY_CHECK(d_dst); } @@ -873,12 +674,7 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareScalar, Combine(GPU_TYPICAL_MAT_SIZES, AR { cv::Mat dst; - cv::compare(src, s, dst, cmp_code); - - TEST_CYCLE() - { - cv::compare(src, s, dst, cmp_code); - } + TEST_CYCLE() cv::compare(src, s, dst, cmp_code); CPU_SANITY_CHECK(dst); } @@ -900,12 +696,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseNot, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_not(d_src, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_not(d_src, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_not(d_src,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -913,12 +704,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseNot, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_ { cv::Mat dst; - cv::bitwise_not(src, dst); - - TEST_CYCLE() - { - cv::bitwise_not(src, dst); - } + TEST_CYCLE() 
cv::bitwise_not(src,dst); CPU_SANITY_CHECK(dst); } @@ -944,12 +730,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseAndMat, Combine(GPU_TYPICAL_MAT_SIZES, Values( cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_and(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_and(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_and(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -957,12 +738,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseAndMat, Combine(GPU_TYPICAL_MAT_SIZES, Values( { cv::Mat dst; - cv::bitwise_and(src1, src2, dst); - - TEST_CYCLE() - { - cv::bitwise_and(src1, src2, dst); - } + TEST_CYCLE() cv::bitwise_and(src1, src2,dst); } } @@ -987,12 +763,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(GPU_TYPICAL_MAT_SIZES, V cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_and(d_src, s, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_and(d_src, s, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_and(d_src, s,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1000,12 +771,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(GPU_TYPICAL_MAT_SIZES, V { cv::Mat dst; - cv::bitwise_and(src, s, dst); - - TEST_CYCLE() - { - cv::bitwise_and(src, s, dst); - } + TEST_CYCLE() cv::bitwise_and(src, s,dst); CPU_SANITY_CHECK(dst); } @@ -1031,12 +797,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseOrMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(C cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_or(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_or(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_or(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1044,12 +805,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseOrMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(C { cv::Mat dst; - cv::bitwise_or(src1, src2, dst); - - TEST_CYCLE() - { - cv::bitwise_or(src1, src2, dst); - } + TEST_CYCLE() cv::bitwise_or(src1, src2,dst); CPU_SANITY_CHECK(dst); } @@ -1076,12 +832,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(GPU_TYPICAL_MAT_SIZES, Va cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_or(d_src, s, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_or(d_src, s, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_or(d_src, s,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1089,12 +840,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(GPU_TYPICAL_MAT_SIZES, Va { cv::Mat dst; - cv::bitwise_or(src, s, dst); - - TEST_CYCLE() - { - cv::bitwise_or(src, s, dst); - } + TEST_CYCLE() cv::bitwise_or(src, s,dst); CPU_SANITY_CHECK(dst); } @@ -1120,12 +866,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseXorMat, Combine(GPU_TYPICAL_MAT_SIZES, Values( cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_xor(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_xor(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_xor(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1133,12 +874,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseXorMat, Combine(GPU_TYPICAL_MAT_SIZES, Values( { cv::Mat dst; - cv::bitwise_xor(src1, src2, dst); - - TEST_CYCLE() - { - cv::bitwise_xor(src1, src2, dst); - } + TEST_CYCLE() cv::bitwise_xor(src1, src2,dst); } } @@ -1163,12 +899,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(GPU_TYPICAL_MAT_SIZES, V cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_xor(d_src, s, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_xor(d_src, s, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_xor(d_src, s,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1176,12 +907,7 @@ 
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(GPU_TYPICAL_MAT_SIZES, V { cv::Mat dst; - cv::bitwise_xor(src, s, dst); - - TEST_CYCLE() - { - cv::bitwise_xor(src, s, dst); - } + TEST_CYCLE() cv::bitwise_xor(src, s,dst); CPU_SANITY_CHECK(dst); } @@ -1208,18 +934,13 @@ PERF_TEST_P(Sz_Depth_Cn, Core_RShift, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::rshift(d_src, val, d_dst); - - TEST_CYCLE() - { - cv::gpu::rshift(d_src, val, d_dst); - } + TEST_CYCLE() cv::gpu::rshift(d_src, val,d_dst); GPU_SANITY_CHECK(d_dst); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -1244,18 +965,13 @@ PERF_TEST_P(Sz_Depth_Cn, Core_LShift, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::lshift(d_src, val, d_dst); - - TEST_CYCLE() - { - cv::gpu::lshift(d_src, val, d_dst); - } + TEST_CYCLE() cv::gpu::lshift(d_src, val,d_dst); GPU_SANITY_CHECK(d_dst); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -1279,12 +995,7 @@ PERF_TEST_P(Sz_Depth, Core_MinMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::min(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::min(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::min(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1292,12 +1003,7 @@ PERF_TEST_P(Sz_Depth, Core_MinMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, { cv::Mat dst; - cv::min(src1, src2, dst); - - TEST_CYCLE() - { - cv::min(src1, src2, dst); - } + TEST_CYCLE() cv::min(src1, src2,dst); CPU_SANITY_CHECK(dst); } @@ -1321,12 +1027,7 @@ PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::min(d_src, val, d_dst); - - TEST_CYCLE() - { - cv::gpu::min(d_src, val, d_dst); - } + TEST_CYCLE() cv::gpu::min(d_src, val,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1334,12 +1035,7 @@ PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 { cv::Mat dst; - cv::min(src, val, dst); - - TEST_CYCLE() - { - cv::min(src, val, dst); - } + TEST_CYCLE() cv::min(src, val,dst); CPU_SANITY_CHECK(dst); } @@ -1365,12 +1061,7 @@ PERF_TEST_P(Sz_Depth, Core_MaxMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::max(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::max(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::max(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1378,12 +1069,7 @@ PERF_TEST_P(Sz_Depth, Core_MaxMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, { cv::Mat dst; - cv::max(src1, src2, dst); - - TEST_CYCLE() - { - cv::max(src1, src2, dst); - } + TEST_CYCLE() cv::max(src1, src2,dst); CPU_SANITY_CHECK(dst); } @@ -1407,12 +1093,7 @@ PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::max(d_src, val, d_dst); - - TEST_CYCLE() - { - cv::gpu::max(d_src, val, d_dst); - } + TEST_CYCLE() cv::gpu::max(d_src, val,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1420,12 +1101,7 @@ PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 { cv::Mat dst; - cv::max(src, val, dst); - - TEST_CYCLE() - { - cv::max(src, val, dst); - } + TEST_CYCLE() cv::max(src, val,dst); CPU_SANITY_CHECK(dst); } @@ -1459,12 +1135,7 @@ PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine( cv::gpu::GpuMat d_src2(src2); 
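// Editorial note (not part of the patch): FAIL_NO_CPU() is the new helper used
// in the GPU-only tests above (abs, sqr, lshift/rshift, magnitudeSqr, ...); its
// definition is not shown in this diff, but judging from the statement it
// replaces it presumably expands to roughly
//   #define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
// so a perf test run in CPU mode still fails explicitly rather than passing silently.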
cv::gpu::GpuMat d_dst; - cv::gpu::addWeighted(d_src1, 0.5, d_src2, 0.5, 10.0, d_dst, dst_depth); - - TEST_CYCLE() - { - cv::gpu::addWeighted(d_src1, 0.5, d_src2, 0.5, 10.0, d_dst, dst_depth); - } + TEST_CYCLE() cv::gpu::addWeighted(d_src1, 0.5, d_src2, 0.5, 10.0, d_dst, dst_depth); GPU_SANITY_CHECK(d_dst); } @@ -1472,12 +1143,7 @@ PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine( { cv::Mat dst; - cv::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); - - TEST_CYCLE() - { - cv::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); - } + TEST_CYCLE() cv::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); CPU_SANITY_CHECK(dst); } @@ -1518,29 +1184,19 @@ PERF_TEST_P(Sz_Type_Flags, Core_GEMM, Combine( cv::gpu::GpuMat d_src3(src3); cv::gpu::GpuMat d_dst; - cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst, flags); + TEST_CYCLE() cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst, flags); - TEST_CYCLE() - { - cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst, flags); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); - declare.time(50.0); - TEST_CYCLE() - { - cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); - } + TEST_CYCLE() cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -1562,12 +1218,7 @@ PERF_TEST_P(Sz_Type, Core_Transpose, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::transpose(d_src, d_dst); - - TEST_CYCLE() - { - cv::gpu::transpose(d_src, d_dst); - } + TEST_CYCLE() cv::gpu::transpose(d_src,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1575,12 +1226,7 @@ PERF_TEST_P(Sz_Type, Core_Transpose, Combine( { cv::Mat dst; - cv::transpose(src, dst); - - TEST_CYCLE() - { - cv::transpose(src, dst); - } + TEST_CYCLE() cv::transpose(src,dst); CPU_SANITY_CHECK(dst); } @@ -1616,12 +1262,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code, Core_Flip, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::flip(d_src, d_dst, flipCode); - - TEST_CYCLE() - { - cv::gpu::flip(d_src, d_dst, flipCode); - } + TEST_CYCLE() cv::gpu::flip(d_src, d_dst, flipCode); GPU_SANITY_CHECK(d_dst); } @@ -1629,12 +1270,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code, Core_Flip, Combine( { cv::Mat dst; - cv::flip(src, dst, flipCode); - - TEST_CYCLE() - { - cv::flip(src, dst, flipCode); - } + TEST_CYCLE() cv::flip(src, dst, flipCode); CPU_SANITY_CHECK(dst); } @@ -1661,12 +1297,7 @@ PERF_TEST_P(Sz_Type, Core_LutOneChannel, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::LUT(d_src, lut, d_dst); - - TEST_CYCLE() - { - cv::gpu::LUT(d_src, lut, d_dst); - } + TEST_CYCLE() cv::gpu::LUT(d_src, lut,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1674,12 +1305,7 @@ PERF_TEST_P(Sz_Type, Core_LutOneChannel, Combine( { cv::Mat dst; - cv::LUT(src, lut, dst); - - TEST_CYCLE() - { - cv::LUT(src, lut, dst); - } + TEST_CYCLE() cv::LUT(src, lut, dst); CPU_SANITY_CHECK(dst); } @@ -1706,12 +1332,7 @@ PERF_TEST_P(Sz_Type, Core_LutMultiChannel, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::LUT(d_src, lut, d_dst); - - TEST_CYCLE() - { - cv::gpu::LUT(d_src, lut, d_dst); - } + TEST_CYCLE() cv::gpu::LUT(d_src, lut,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1719,12 +1340,7 @@ PERF_TEST_P(Sz_Type, Core_LutMultiChannel, Combine( { cv::Mat dst; - cv::LUT(src, lut, dst); - - TEST_CYCLE() - { - cv::LUT(src, lut, dst); - } + TEST_CYCLE() cv::LUT(src, lut, dst); CPU_SANITY_CHECK(dst); } @@ -1745,14 +1361,9 @@ PERF_TEST_P(Sz, Core_MagnitudeComplex, 
GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::magnitude(d_src, d_dst); + TEST_CYCLE() cv::gpu::magnitude(d_src,d_dst); - TEST_CYCLE() - { - cv::gpu::magnitude(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { @@ -1761,14 +1372,9 @@ PERF_TEST_P(Sz, Core_MagnitudeComplex, GPU_TYPICAL_MAT_SIZES) cv::Mat dst; - cv::magnitude(xy[0], xy[1], dst); + TEST_CYCLE() cv::magnitude(xy[0], xy[1], dst); - TEST_CYCLE() - { - cv::magnitude(xy[0], xy[1], dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -1787,18 +1393,13 @@ PERF_TEST_P(Sz, Core_MagnitudeSqrComplex, GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::magnitudeSqr(d_src, d_dst); - - TEST_CYCLE() - { - cv::gpu::magnitudeSqr(d_src, d_dst); - } + TEST_CYCLE() cv::gpu::magnitudeSqr(d_src, d_dst); GPU_SANITY_CHECK(d_dst); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -1821,27 +1422,17 @@ PERF_TEST_P(Sz, Core_Magnitude, GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::magnitude(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::magnitude(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::magnitude(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::magnitude(src1, src2, dst); + TEST_CYCLE() cv::magnitude(src1, src2, dst); - TEST_CYCLE() - { - cv::magnitude(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -1865,18 +1456,13 @@ PERF_TEST_P(Sz, Core_MagnitudeSqr, GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::magnitudeSqr(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::magnitudeSqr(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::magnitudeSqr(d_src1, d_src2, d_dst); GPU_SANITY_CHECK(d_dst); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -1902,27 +1488,17 @@ PERF_TEST_P(Sz_AngleInDegrees, Core_Phase, Combine(GPU_TYPICAL_MAT_SIZES, Bool() cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::phase(d_src1, d_src2, d_dst, angleInDegrees); + TEST_CYCLE() cv::gpu::phase(d_src1, d_src2, d_dst, angleInDegrees); - TEST_CYCLE() - { - cv::gpu::phase(d_src1, d_src2, d_dst, angleInDegrees); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::phase(src1, src2, dst, angleInDegrees); + TEST_CYCLE() cv::phase(src1, src2, dst, angleInDegrees); - TEST_CYCLE() - { - cv::phase(src1, src2, dst, angleInDegrees); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -1947,15 +1523,10 @@ PERF_TEST_P(Sz_AngleInDegrees, Core_CartToPolar, Combine(GPU_TYPICAL_MAT_SIZES, cv::gpu::GpuMat d_magnitude; cv::gpu::GpuMat d_angle; - cv::gpu::cartToPolar(d_src1, d_src2, d_magnitude, d_angle, angleInDegrees); + TEST_CYCLE() cv::gpu::cartToPolar(d_src1, d_src2, d_magnitude, d_angle, angleInDegrees); - TEST_CYCLE() - { - cv::gpu::cartToPolar(d_src1, d_src2, d_magnitude, d_angle, angleInDegrees); - } - - GPU_SANITY_CHECK(d_magnitude); - GPU_SANITY_CHECK(d_angle); + GPU_SANITY_CHECK(d_magnitude, 1e-8); + GPU_SANITY_CHECK(d_angle, 1e-8); } else @@ -1963,15 +1534,10 @@ PERF_TEST_P(Sz_AngleInDegrees, Core_CartToPolar, Combine(GPU_TYPICAL_MAT_SIZES, cv::Mat magnitude; cv::Mat angle; - cv::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); + TEST_CYCLE() cv::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); - TEST_CYCLE() 
- { - cv::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); - } - - CPU_SANITY_CHECK(magnitude); - CPU_SANITY_CHECK(angle); + CPU_SANITY_CHECK(magnitude, 1e-8); + CPU_SANITY_CHECK(angle, 1e-8); } } @@ -1996,30 +1562,20 @@ PERF_TEST_P(Sz_AngleInDegrees, Core_PolarToCart, Combine(GPU_TYPICAL_MAT_SIZES, cv::gpu::GpuMat d_x; cv::gpu::GpuMat d_y; - cv::gpu::polarToCart(d_magnitude, d_angle, d_x, d_y, angleInDegrees); + TEST_CYCLE() cv::gpu::polarToCart(d_magnitude, d_angle, d_x, d_y, angleInDegrees); - TEST_CYCLE() - { - cv::gpu::polarToCart(d_magnitude, d_angle, d_x, d_y, angleInDegrees); - } - - GPU_SANITY_CHECK(d_x); - GPU_SANITY_CHECK(d_y); + GPU_SANITY_CHECK(d_x, 1e-8); + GPU_SANITY_CHECK(d_y, 1e-8); } else { cv::Mat x; cv::Mat y; - cv::polarToCart(magnitude, angle, x, y, angleInDegrees); + TEST_CYCLE() cv::polarToCart(magnitude, angle, x, y, angleInDegrees); - TEST_CYCLE() - { - cv::polarToCart(magnitude, angle, x, y, angleInDegrees); - } - - CPU_SANITY_CHECK(x); - CPU_SANITY_CHECK(y); + CPU_SANITY_CHECK(x, 1e-8); + CPU_SANITY_CHECK(y, 1e-8); } } @@ -2041,24 +1597,14 @@ PERF_TEST_P(Sz, Core_MeanStdDev, GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - cv::gpu::meanStdDev(d_src, mean, stddev, d_buf); - - TEST_CYCLE() - { - cv::gpu::meanStdDev(d_src, mean, stddev, d_buf); - } + TEST_CYCLE() cv::gpu::meanStdDev(d_src, mean, stddev, d_buf); } else { - cv::meanStdDev(src, mean, stddev); - - TEST_CYCLE() - { - cv::meanStdDev(src, mean, stddev); - } + TEST_CYCLE() cv::meanStdDev(src, mean, stddev); } - GPU_SANITY_CHECK(stddev); + GPU_SANITY_CHECK(stddev, 1e-6); } ////////////////////////////////////////////////////////////////////// @@ -2085,24 +1631,14 @@ PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::norm(d_src, normType, d_buf); - - TEST_CYCLE() - { - dst = cv::gpu::norm(d_src, normType, d_buf); - } + TEST_CYCLE() dst = cv::gpu::norm(d_src, normType, d_buf); } else { - dst = cv::norm(src, normType); - - TEST_CYCLE() - { - dst = cv::norm(src, normType); - } + TEST_CYCLE() dst = cv::norm(src, normType); } - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } ////////////////////////////////////////////////////////////////////// @@ -2130,25 +1666,15 @@ PERF_TEST_P(Sz_Norm, Core_NormDiff, Combine( cv::gpu::GpuMat d_src1(src1); cv::gpu::GpuMat d_src2(src2); - dst = cv::gpu::norm(d_src1, d_src2, normType); - - TEST_CYCLE() - { - dst = cv::gpu::norm(d_src1, d_src2, normType); - } + TEST_CYCLE() dst = cv::gpu::norm(d_src1, d_src2, normType); } else { - dst = cv::norm(src1, src2, normType); - - TEST_CYCLE() - { - dst = cv::norm(src1, src2, normType); - } + TEST_CYCLE() dst = cv::norm(src1, src2, normType); } - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } ////////////////////////////////////////////////////////////////////// @@ -2175,24 +1701,14 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::sum(d_src, d_buf); - - TEST_CYCLE() - { - dst = cv::gpu::sum(d_src, d_buf); - } + TEST_CYCLE() dst = cv::gpu::sum(d_src, d_buf); } else { - dst = cv::sum(src); - - TEST_CYCLE() - { - dst = cv::sum(src); - } + TEST_CYCLE() dst = cv::sum(src); } - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } ////////////////////////////////////////////////////////////////////// @@ -2219,18 +1735,13 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::absSum(d_src, d_buf); + TEST_CYCLE() dst = 
cv::gpu::absSum(d_src, d_buf); - TEST_CYCLE() - { - dst = cv::gpu::absSum(d_src, d_buf); - } - - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -2258,18 +1769,13 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::sqrSum(d_src, d_buf); + TEST_CYCLE() dst = cv::gpu::sqrSum(d_src, d_buf); - TEST_CYCLE() - { - dst = cv::gpu::sqrSum(d_src, d_buf); - } - - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -2293,19 +1799,14 @@ PERF_TEST_P(Sz_Depth, Core_MinMax, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - cv::gpu::minMax(d_src, &minVal, &maxVal, cv::gpu::GpuMat(), d_buf); - - TEST_CYCLE() - { - cv::gpu::minMax(d_src, &minVal, &maxVal, cv::gpu::GpuMat(), d_buf); - } + TEST_CYCLE() cv::gpu::minMax(d_src, &minVal, &maxVal, cv::gpu::GpuMat(), d_buf); SANITY_CHECK(minVal); SANITY_CHECK(maxVal); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -2330,25 +1831,15 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_valbuf, d_locbuf; - cv::gpu::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), d_valbuf, d_locbuf); - - TEST_CYCLE() - { - cv::gpu::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), d_valbuf, d_locbuf); - } + TEST_CYCLE() cv::gpu::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), d_valbuf, d_locbuf); } else { - cv::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc); - - TEST_CYCLE() - { - cv::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc); - } + TEST_CYCLE() cv::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc); } - SANITY_CHECK(minVal); - SANITY_CHECK(maxVal); + SANITY_CHECK(minVal, 1e-12); + SANITY_CHECK(maxVal, 1e-12); // unsupported by peft system //SANITY_CHECK(minLoc); @@ -2368,28 +1859,18 @@ PERF_TEST_P(Sz_Depth, Core_CountNonZero, Combine( cv::Mat src(size, depth); fillRandom(src); - int dst; + int dst = 0; if (PERF_RUN_GPU()) { cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::countNonZero(d_src, d_buf); - - TEST_CYCLE() - { - dst = cv::gpu::countNonZero(d_src, d_buf); - } + TEST_CYCLE() dst = cv::gpu::countNonZero(d_src, d_buf); } else { - dst = cv::countNonZero(src); - - TEST_CYCLE() - { - dst = cv::countNonZero(src); - } + TEST_CYCLE() dst = cv::countNonZero(src); } SANITY_CHECK(dst); @@ -2430,25 +1911,17 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::reduce(d_src, d_dst, dim, reduceOp); + TEST_CYCLE() cv::gpu::reduce(d_src, d_dst, dim, reduceOp); - TEST_CYCLE() - { - cv::gpu::reduce(d_src, d_dst, dim, reduceOp); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1); } else { cv::Mat dst; - cv::reduce(src, dst, dim, reduceOp); + TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp); - TEST_CYCLE() - { - cv::reduce(src, dst, dim, reduceOp); - } + CPU_SANITY_CHECK(dst, 1); } } diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 30377e148f..ee0968442c 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES) { cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_hist; - cv::gpu::GpuMat d_buf; - cv::gpu::calcHist(d_src, d_hist, d_buf); + cv::gpu::calcHist(d_src, d_hist); 
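
The perf_core.cpp hunks above all apply the same mechanical simplification: the separate warm-up call followed by a braced TEST_CYCLE() block collapses into a single TEST_CYCLE() statement, and the sanity-check macros gain an explicit tolerance where exact equality is too strict. A minimal sketch of the resulting test shape, modeled on the Core_CountNonZero hunk and using only macros already present in these files (PERF_TEST_P, TEST_CYCLE, PERF_RUN_GPU, SANITY_CHECK, fillRandom); the test name here is illustrative only:

PERF_TEST_P(Sz, Core_ExampleCountNonZero, GPU_TYPICAL_MAT_SIZES)
{
    const cv::Size size = GetParam();

    cv::Mat src(size, CV_8UC1);
    fillRandom(src);

    int dst = 0;

    if (PERF_RUN_GPU())
    {
        cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat d_buf;

        // one-line form: TEST_CYCLE() now wraps the measured call directly,
        // replacing the old warm-up call plus TEST_CYCLE() { ... } block
        TEST_CYCLE() dst = cv::gpu::countNonZero(d_src, d_buf);
    }
    else
    {
        TEST_CYCLE() dst = cv::countNonZero(src);
    }

    // other hunks pass a tolerance here, e.g. SANITY_CHECK(dst, 1e-6)
    SANITY_CHECK(dst);
}
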
TEST_CYCLE() { - cv::gpu::calcHist(d_src, d_hist, d_buf); + cv::gpu::calcHist(d_src, d_hist); } GPU_SANITY_CHECK(d_hist); @@ -1706,10 +1705,40 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidGetLayer, Combine(GPU_TYPICAL_MAT_S } } +namespace { + struct Vec4iComparator + { + bool operator()(const cv::Vec4i& a, const cv::Vec4i b) const + { + if (a[0] != b[0]) return a[0] < b[0]; + else if(a[1] != b[1]) return a[1] < b[1]; + else if(a[2] != b[2]) return a[2] < b[2]; + else return a[3] < b[3]; + } + }; + struct Vec3fComparator + { + bool operator()(const cv::Vec3f& a, const cv::Vec3f b) const + { + if(a[0] != b[0]) return a[0] < b[0]; + else if(a[1] != b[1]) return a[1] < b[1]; + else return a[2] < b[2]; + } + }; + struct Vec2fComparator + { + bool operator()(const cv::Vec2f& a, const cv::Vec2f b) const + { + if(a[0] != b[0]) return a[0] < b[0]; + else return a[1] < b[1]; + } + }; +} + ////////////////////////////////////////////////////////////////////// // HoughLines -PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) +PERF_TEST_P(Sz, ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) { declare.time(30.0); @@ -1744,7 +1773,11 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold); } - GPU_SANITY_CHECK(d_lines); + cv::Mat h_lines(d_lines); + cv::Vec2f* begin = (cv::Vec2f*)(h_lines.ptr(0)); + cv::Vec2f* end = (cv::Vec2f*)(h_lines.ptr(0) + (h_lines.cols) * 2 * sizeof(float)); + std::sort(begin, end, Vec2fComparator()); + SANITY_CHECK(h_lines); } else { @@ -1756,7 +1789,64 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) cv::HoughLines(src, lines, rho, theta, threshold); } - CPU_SANITY_CHECK(lines); + std::sort(lines.begin(), lines.end(), Vec2fComparator()); + SANITY_CHECK(lines); + } +} + +////////////////////////////////////////////////////////////////////// +// HoughLinesP + +DEF_PARAM_TEST_1(Image, std::string); + +PERF_TEST_P(Image, ImgProc_HoughLinesP, testing::Values("cv/shared/pic5.png", "stitching/a1.png")) +{ + declare.time(30.0); + + std::string fileName = getDataPath(GetParam()); + + const double rho = 1.0f; + const double theta = CV_PI / 180.0; + const int threshold = 100; + const int minLineLenght = 50; + const int maxLineGap = 5; + + cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE); + + cv::Mat mask; + cv::Canny(image, mask, 50, 100); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_mask(mask); + cv::gpu::GpuMat d_lines; + cv::gpu::HoughLinesBuf d_buf; + + cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + } + + cv::Mat h_lines(d_lines); + cv::Vec4i* begin = h_lines.ptr(); + cv::Vec4i* end = h_lines.ptr() + h_lines.cols; + std::sort(begin, end, Vec4iComparator()); + SANITY_CHECK(h_lines); + } + else + { + std::vector lines; + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + } + + std::sort(lines.begin(), lines.end(), Vec4iComparator()); + SANITY_CHECK(lines); } } @@ -1804,7 +1894,11 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES, cv::gpu::HoughCircles(d_src, d_circles, d_buf, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius); } - GPU_SANITY_CHECK(d_circles); + cv::Mat h_circles(d_circles); + cv::Vec3f* begin = 
(cv::Vec3f*)(h_circles.ptr(0)); + cv::Vec3f* end = (cv::Vec3f*)(h_circles.ptr(0) + (h_circles.cols) * 3 * sizeof(float)); + std::sort(begin, end, Vec3fComparator()); + SANITY_CHECK(h_circles); } else { @@ -1817,7 +1911,8 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES, cv::HoughCircles(src, circles, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius); } - CPU_SANITY_CHECK(circles); + std::sort(circles.begin(), circles.end(), Vec3fComparator()); + SANITY_CHECK(circles); } } diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 6b864a3e52..6d040ac02f 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values("gpu/caltech/image_00000009_0.png", "gp SANITY_CHECK(found_locations); } - /////////////////////////////////////////////////////////////// // HaarClassifier @@ -181,4 +180,4 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier, } } -} // namespace +} // namespace \ No newline at end of file diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp new file mode 100644 index 0000000000..e9437d70f9 --- /dev/null +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -0,0 +1,279 @@ +#include "perf_precomp.hpp" + +#define GPU_PERF_TEST_P(fixture, name, params) \ + class fixture##_##name : public fixture {\ + public:\ + fixture##_##name() {}\ + protected:\ + virtual void __cpu();\ + virtual void __gpu();\ + virtual void PerfTestBody();\ + };\ + TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); }\ + INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\ + void fixture##_##name::PerfTestBody() { if (PERF_RUN_GPU()) __gpu(); else __cpu(); } + +#define RUN_CPU(fixture, name)\ + void fixture##_##name::__cpu() + +#define RUN_GPU(fixture, name)\ + void fixture##_##name::__gpu() + +#define NO_CPU(fixture, name)\ +void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";} + +namespace { + struct DetectionLess + { + bool operator()(const cv::gpu::SCascade::Detection& a, + const cv::gpu::SCascade::Detection& b) const + { + if (a.x != b.x) return a.x < b.x; + else if (a.y != b.y) return a.y < b.y; + else if (a.w != b.w) return a.w < b.w; + else return a.h < b.h; + } + }; + + cv::Mat sortDetections(cv::gpu::GpuMat& objects) + { + cv::Mat detections(objects); + + typedef cv::gpu::SCascade::Detection Detection; + Detection* begin = (Detection*)(detections.ptr(0)); + Detection* end = (Detection*)(detections.ptr(0) + detections.cols); + std::sort(begin, end, DetectionLess()); + + return detections; + } +} + + +typedef std::tr1::tuple fixture_t; +typedef perf::TestBaseWithParam SCascadeTest; + +GPU_PERF_TEST_P(SCascadeTest, detect, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) + +RUN_GPU(SCascadeTest, detect) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(1); + + cascade.detect(colored, rois, objectBoxes); + + TEST_CYCLE() + { + 
cascade.detect(colored, rois, objectBoxes); + } + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTest, detect) + +static cv::Rect getFromTable(int idx) +{ + static const cv::Rect rois[] = + { + cv::Rect( 65 * 4, 20 * 4, 35 * 4, 80 * 4), + cv::Rect( 95 * 4, 35 * 4, 45 * 4, 40 * 4), + cv::Rect( 45 * 4, 35 * 4, 45 * 4, 40 * 4), + cv::Rect( 25 * 4, 27 * 4, 50 * 4, 45 * 4), + cv::Rect(100 * 4, 50 * 4, 45 * 4, 40 * 4), + + cv::Rect( 60 * 4, 30 * 4, 45 * 4, 40 * 4), + cv::Rect( 40 * 4, 55 * 4, 50 * 4, 40 * 4), + cv::Rect( 48 * 4, 37 * 4, 72 * 4, 80 * 4), + cv::Rect( 48 * 4, 32 * 4, 85 * 4, 58 * 4), + cv::Rect( 48 * 4, 0 * 4, 32 * 4, 27 * 4) + }; + + return rois[idx]; +} + +typedef std::tr1::tuple roi_fixture_t; +typedef perf::TestBaseWithParam SCascadeTestRoi; + +GPU_PERF_TEST_P(SCascadeTestRoi, detectInRoi, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 5))) + +RUN_GPU(SCascadeTestRoi, detectInRoi) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(0); + + int nroi = GET_PARAM(2); + cv::RNG rng; + for (int i = 0; i < nroi; ++i) + { + cv::Rect r = getFromTable(rng(10)); + cv::gpu::GpuMat sub(rois, r); + sub.setTo(1); + } + + cascade.detect(colored, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detect(colored, rois, objectBoxes); + } + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTestRoi, detectInRoi) + + +GPU_PERF_TEST_P(SCascadeTestRoi, detectEachRoi, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 10))) + +RUN_GPU(SCascadeTestRoi, detectEachRoi) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(0); + + int idx = GET_PARAM(2); + cv::Rect r = getFromTable(idx); + cv::gpu::GpuMat sub(rois, r); + sub.setTo(1); + + cascade.detect(colored, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detect(colored, rois, objectBoxes); + } + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTestRoi, detectEachRoi) + +GPU_PERF_TEST_P(SCascadeTest, detectOnIntegral, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/integrals.xml")))) + +static std::string itoa(long i) +{ + static char s[65]; + sprintf(s, "%ld", i); + return std::string(s); +} + +RUN_GPU(SCascadeTest, detectOnIntegral) +{ + cv::FileStorage fsi(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); + ASSERT_TRUE(fsi.isOpened()); + + cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1); + for (int i = 0; i < 10; ++i) + { + cv::Mat channel; + fsi[std::string("channel") + 
itoa(i)] >> channel; + cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121)); + gchannel.upload(channel); + } + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(cv::Size(640, 480), CV_8UC1); + rois.setTo(1); + + cascade.detect(hogluv, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detect(hogluv, rois, objectBoxes); + } + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTest, detectOnIntegral) + +GPU_PERF_TEST_P(SCascadeTest, detectStream, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) + +RUN_GPU(SCascadeTest, detectStream) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(1); + + cv::gpu::Stream s; + + cascade.detect(colored, rois, objectBoxes, s); + + TEST_CYCLE() + { + cascade.detect(colored, rois, objectBoxes, s); + } + +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); +#endif + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTest, detectStream) diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp index b18cb17dfb..bf2fd99c6e 100644 --- a/modules/gpu/perf/perf_video.cpp +++ b/modules/gpu/perf/perf_video.cpp @@ -394,6 +394,173 @@ PERF_TEST_P(ImagePair, Video_FarnebackOpticalFlow, } } +////////////////////////////////////////////////////// +// OpticalFlowDual_TVL1 + +PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1, + Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png"))) +{ + declare.time(20); + + cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame0.empty()); + + cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame1.empty()); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_frame0(frame0); + cv::gpu::GpuMat d_frame1(frame1); + cv::gpu::GpuMat d_flowx; + cv::gpu::GpuMat d_flowy; + + cv::gpu::OpticalFlowDual_TVL1_GPU d_alg; + + d_alg(d_frame0, d_frame1, d_flowx, d_flowy); + + TEST_CYCLE() + { + d_alg(d_frame0, d_frame1, d_flowx, d_flowy); + } + + GPU_SANITY_CHECK(d_flowx); + GPU_SANITY_CHECK(d_flowy); + } + else + { + cv::Mat flow; + + cv::OpticalFlowDual_TVL1 alg; + + alg(frame0, frame1, flow); + + TEST_CYCLE() + { + alg(frame0, frame1, flow); + } + + CPU_SANITY_CHECK(flow); + } +} + +////////////////////////////////////////////////////// +// OpticalFlowBM + +void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr, + cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious, + cv::Mat& velx, cv::Mat& vely) +{ + cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height); + + velx.create(sz, CV_32FC1); + vely.create(sz, CV_32FC1); + + CvMat cvprev = prev; + CvMat cvcurr = curr; + + CvMat cvvelx = velx; + CvMat cvvely = vely; + + 
cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely); +} + +PERF_TEST_P(ImagePair, Video_OpticalFlowBM, + Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png"))) +{ + declare.time(400); + + cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame0.empty()); + + cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame1.empty()); + + cv::Size block_size(16, 16); + cv::Size shift_size(1, 1); + cv::Size max_range(16, 16); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_frame0(frame0); + cv::gpu::GpuMat d_frame1(frame1); + cv::gpu::GpuMat d_velx, d_vely, buf; + + cv::gpu::calcOpticalFlowBM(d_frame0, d_frame1, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + + TEST_CYCLE() + { + cv::gpu::calcOpticalFlowBM(d_frame0, d_frame1, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + } + + GPU_SANITY_CHECK(d_velx); + GPU_SANITY_CHECK(d_vely); + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE() + { + calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); + } + + CPU_SANITY_CHECK(velx); + CPU_SANITY_CHECK(vely); + } +} + +PERF_TEST_P(ImagePair, Video_FastOpticalFlowBM, + Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png"))) +{ + declare.time(400); + + cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame0.empty()); + + cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame1.empty()); + + cv::Size block_size(16, 16); + cv::Size shift_size(1, 1); + cv::Size max_range(16, 16); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_frame0(frame0); + cv::gpu::GpuMat d_frame1(frame1); + cv::gpu::GpuMat d_velx, d_vely; + + cv::gpu::FastOpticalFlowBM fastBM; + + fastBM(d_frame0, d_frame1, d_velx, d_vely, max_range.width, block_size.width); + + TEST_CYCLE() + { + fastBM(d_frame0, d_frame1, d_velx, d_vely, max_range.width, block_size.width); + } + + GPU_SANITY_CHECK(d_velx); + GPU_SANITY_CHECK(d_vely); + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE() + { + calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); + } + + CPU_SANITY_CHECK(velx); + CPU_SANITY_CHECK(vely); + } +} + ////////////////////////////////////////////////////// // FGDStatModel diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 1a10bc32eb..242febded9 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream) { #ifndef HAVE_CUBLAS - (void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream; + (void)src1; + (void)src2; + (void)alpha; + (void)src3; + (void)beta; + (void)dst; + (void)flags; + (void)stream; CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS"); - #else - // CUBLAS works with column-major matrices CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2); @@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double 
alpha, const G if (src1.depth() == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } @@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G } cublasSafeCall( cublasDestroy_v2(handle) ); - #endif } @@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) } else // if (src.elemSize() == 8) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); NppStStreamHandler h(stream); diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index a04639715d..095a64adb4 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); } namespace bf_knnmatch { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const 
PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); } namespace bf_radius_match { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); } }}} @@ -202,7 +202,7 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -238,10 +238,7 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai caller_t func = callers[query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector& matches) @@ -348,7 +345,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat& typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -383,10 +380,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat& caller_t func = callers[query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, 
trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector& matches) @@ -462,7 +456,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -512,10 +506,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t caller_t func = callers[query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); + func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, @@ -594,7 +585,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -634,10 +625,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM caller_t func = callers[query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, @@ -778,7 +766,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -799,12 +787,6 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat matchHamming_gpu, 0/*matchHamming_gpu*/ }; - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) - CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); - const int nQuery = query.rows; const int nTrain = train.rows; @@ -830,7 +812,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat caller_t func = callers[query.depth()]; CV_Assert(func != 0); - func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); + func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& 
nMatches, @@ -913,7 +895,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -934,12 +916,6 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& matchHamming_gpu, 0/*matchHamming_gpu*/ }; - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) - CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); - const int nQuery = query.rows; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); @@ -968,7 +944,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& vector masks_(masks.begin(), masks.end()); func(query, &trains_[0], static_cast(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], - trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); + trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 07e174e5cf..3603933979 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -623,7 +623,7 @@ private: } // copy data structures on gpu - stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) )); + stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) )); trees_mat.upload(cv::Mat(cl_trees).reshape(1,1)); nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1)); leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1)); diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index 6a778735b8..49bc1dfcd2 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -42,10 +42,13 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" +#include "opencv2/gpu/device/warp_shuffle.hpp" namespace cv { namespace gpu { namespace device { @@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device int& bestTrainIdx1, int& bestTrainIdx2, float* s_distance, int* s_trainIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + + float d1, d2; + int i1, i2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + 
bestTrainIdx2 = i2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx2 = myBestTrainIdx2; + #endif } template @@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device int& bestImgIdx1, int& bestImgIdx2, float* s_distance, int* s_trainIdx, int* s_imgIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + (void) s_imgIdx; + + float d1, d2; + int i1, i2; + int j1, j2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE); + j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + bestImgIdx2 = j1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + bestImgIdx2 = bestImgIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + bestImgIdx1 = j1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + bestTrainIdx2 = i2; + bestImgIdx2 = j2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device bestImgIdx1 = myBestImgIdx1; bestImgIdx2 = myBestImgIdx2; + #endif } /////////////////////////////////////////////////////////////////////////////// @@ -748,9 +839,8 @@ namespace cv { namespace gpu { namespace device template void match2Dispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz >(trainIdx), static_cast< PtrStepSz > (distance), stream); @@ -780,9 +870,8 @@ namespace cv { namespace gpu { namespace device template void match2Dispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz >(trainIdx), static_cast< PtrStepSz >(imgIdx), static_cast< PtrStepSz > (distance), stream); @@ -945,9 +1034,8 @@ namespace cv { namespace gpu { namespace device template void calcDistanceDispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); @@ -1005,7 +1093,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx[threadIdx.x] = bestIdx; __syncthreads(); - reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); + reduceKeyVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); if (threadIdx.x == 0) { @@ -1034,7 +1122,7 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } - void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, 
const PtrStepSzf& allDist, int cc, cudaStream_t stream) + void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream) { findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); } @@ -1045,16 +1133,16 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, int k, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (k == 2) { - match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); + match2Dispatcher(query, train, mask, trainIdx, distance, stream); } else { - calcDistanceDispatcher(query, train, mask, allDist, cc, stream); - findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); + calcDistanceDispatcher(query, train, mask, allDist, stream); + findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); } } @@ -1063,105 +1151,105 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream); else - matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& 
distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream); else - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const 
PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream); else - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& 
queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream); } - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const 
PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream); } - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, 
const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream); } - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, 
cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); } // namespace bf_knnmatch }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index f50089ed94..5e64e31bd9 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -42,7 +42,9 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device s_distance += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - - __syncthreads(); - - reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); } template @@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - s_imgIdx [threadIdx.x] = bestImgIdx; - - __syncthreads(); - - reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less()); } /////////////////////////////////////////////////////////////////////////////// @@ -567,9 +558,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); @@ -599,9 +589,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); @@ -633,153 +622,153 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, 
const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, 
cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void 
matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void 
matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& 
distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); } // namespace bf_match }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 934b8fe84c..19ee94e331 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -42,7 +42,8 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device __global__ void matchUnrolled(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, 
unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device __global__ void match(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -281,9 +274,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, float maxDistance, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); @@ -313,9 +305,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); @@ -347,126 +338,126 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t 
stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void 
matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void 
matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void 
matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const 
PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const 
PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); } // namespace bf_radius_match }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 40c847547e..0fd482c41a 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -42,9 +42,10 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/reduce.hpp" namespace cv { namespace gpu { namespace device { @@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); } + __device__ __forceinline__ TransformOp() {} + __device__ __forceinline__ TransformOp(const TransformOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); } + __device__ __forceinline__ ProjectOp() {} + __device__ __forceinline__ ProjectOp(const ProjectOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device return x * x; } + template __global__ void computeHypothesisScoresKernel( const int num_points, const float3* object, const float2* image, const float dist_threshold, int* g_num_inliers) @@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device ++num_inliers; } - extern __shared__ float s_num_inliers[]; - s_num_inliers[threadIdx.x] = num_inliers; - __syncthreads(); - - for (int step = blockDim.x / 2; step > 0; step >>= 1) - { - if (threadIdx.x < step) - s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step]; - __syncthreads(); - } + __shared__ int s_num_inliers[BLOCK_SIZE]; + reduce(s_num_inliers, num_inliers, threadIdx.x, plus()); if (threadIdx.x == 0) - g_num_inliers[blockIdx.x] = s_num_inliers[0]; + g_num_inliers[blockIdx.x] = num_inliers; } void computeHypothesisScores( @@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device 
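For context on the two calib3d.cu hunks around this point: the old computeHypothesisScoresKernel summed per-thread inlier counts with a hand-rolled loop over dynamically sized shared memory, while the new version declares a fixed __shared__ int s_num_inliers[BLOCK_SIZE] and delegates to the reduce(...) helper from opencv2/gpu/device/reduce.hpp; that is why the kernel gains a compile-time BLOCK_SIZE template parameter (instantiated as 256 at the launch site just below) and why the separate smem_size launch argument disappears. The same idea drives the reduceKeyVal/smem_tuple changes in the matcher files earlier in this patch. What follows is only an illustrative sketch of the block-reduction pattern being encapsulated; it is not part of this patch and not the actual OpenCV helper, and the kernel name, parameters, and launch shown are assumptions made for the example.

    // Sketch: block-wide sum reduction over statically sized shared memory.
    // BLOCK_SIZE is a power of two and must equal blockDim.x.
    template <int BLOCK_SIZE>
    __global__ void countInliersSketch(const float* residuals, int num_points,
                                       float dist_threshold, int* block_sums)
    {
        __shared__ int smem[BLOCK_SIZE];

        // each thread accumulates a partial count over a strided range of points
        int num_inliers = 0;
        for (int i = threadIdx.x; i < num_points; i += BLOCK_SIZE)
            num_inliers += residuals[i] < dist_threshold;

        smem[threadIdx.x] = num_inliers;
        __syncthreads();

        // tree reduction in shared memory
        for (int step = BLOCK_SIZE / 2; step > 0; step >>= 1)
        {
            if (threadIdx.x < step)
                smem[threadIdx.x] += smem[threadIdx.x + step];
            __syncthreads();
        }

        // thread 0 holds the block-wide sum
        if (threadIdx.x == 0)
            block_sums[blockIdx.x] = smem[0];
    }

    // hypothetical usage: one 256-thread block per hypothesis, no dynamic shared memory argument
    //   countInliersSketch<256><<<num_hypotheses, 256>>>(residuals, num_points, dist_threshold, block_sums);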
dim3 threads(256); dim3 grid(num_hypotheses); - int smem_size = threads.x * sizeof(float); - computeHypothesisScoresKernel<<>>( + computeHypothesisScoresKernel<256><<>>( num_points, object, image, dist_threshold, hypothesis_scores); cudaSafeCall( cudaGetLastError() ); @@ -193,4 +190,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu index 3dc0486783..0a5daebaaf 100644 --- a/modules/gpu/src/cuda/canny.cu +++ b/modules/gpu/src/cuda/canny.cu @@ -43,459 +43,451 @@ #if !defined CUDA_DISABLER #include -#include -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/emulation.hpp" +#include "opencv2/gpu/device/transform.hpp" +#include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/utility.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace canny +{ + struct L1 : binary_function + { + __device__ __forceinline__ float operator ()(int x, int y) const + { + return ::abs(x) + ::abs(y); + } + + __device__ __forceinline__ L1() {} + __device__ __forceinline__ L1(const L1&) {} + }; + struct L2 : binary_function + { + __device__ __forceinline__ float operator ()(int x, int y) const + { + return ::sqrtf(x * x + y * y); + } + + __device__ __forceinline__ L2() {} + __device__ __forceinline__ L2(const L2&) {} + }; +} namespace cv { namespace gpu { namespace device { - namespace canny + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits { - __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; +}}} + +namespace canny +{ + texture tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); + struct SrcTex + { + const int xoff; + const int yoff; + __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} + + __device__ __forceinline__ int operator ()(int y, int x) const { - __shared__ int smem[16][18]; + return tex2D(tex_src, x + xoff, y + yoff); + } + }; - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + template __global__ + void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (i < rows) - { - smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; - if (threadIdx.x == 0) - { - smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)]; - smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)]; - } - __syncthreads(); + if (y >= mag.rows || x >= mag.cols) + return; - if (j < cols) - { - dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2]; - dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2]; - } - } + int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); + int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); + + dx(y, x) = dxVal; + dy(y, x) = dyVal; + + mag(y, x) = norm(dxVal, dyVal); + } + + 
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + const dim3 block(16, 16); + const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); + + bindTexture(&tex_src, srcWhole); + SrcTex src(xoff, yoff); + + if (L2Grad) + { + L2 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + } + else + { + L1 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); } - void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall(cudaThreadSynchronize()); + } + + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + if (L2Grad) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - calcSobelRowPass<<>>(src, dx_buf, dy_buf, rows, cols); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); + L2 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); } - - struct L1 + else { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::abs(x) + ::abs(y); - } - }; - struct L2 - { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::sqrtf(x * x + y * y); - } - }; - - template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, - PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) - { - __shared__ int sdx[18][16]; - __shared__ int sdy[18][16]; - - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; - - if (j < cols) - { - sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j]; - sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; - if (threadIdx.y == 0) - { - sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j]; - sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j]; - - sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j]; - sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j]; - } - __syncthreads(); - - if (i < rows) - { - int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x]; - int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x]; - - dx.ptr(i)[j] = x; - dy.ptr(i)[j] = y; - - mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); - } - } + L1 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); } + } +} - void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) +////////////////////////////////////////////////////////////////////////////////////////// + +namespace canny +{ + texture tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); + + __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) + { + const int CANNY_SHIFT = 15; + const int TG22 = (int)(0.4142135623730950488016887242097*(1<= dx.cols - 1 || y == 0 || y >= dx.rows - 1) + return; + + int dxVal = dx(y, x); + int dyVal = dy(y, x); + + const int s = (dxVal ^ dyVal) < 0 ? 
-1 : 1; + const float m = tex2D(tex_mag, x, y); + + dxVal = ::abs(dxVal); + dyVal = ::abs(dyVal); + + // 0 - the pixel can not belong to an edge + // 1 - the pixel might belong to an edge + // 2 - the pixel does belong to an edge + int edge_type = 0; + + if (m > low_thresh) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + const int tg22x = dxVal * TG22; + const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); - if (L2Grad) - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); + dyVal <<= CANNY_SHIFT; + + if (dyVal < tg22x) + { + if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) + edge_type = 1 + (int)(m > high_thresh); + } + else if(dyVal > tg67x) + { + if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } else - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); - - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall(cudaThreadSynchronize()); + { + if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } } - template __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) - { - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + map(y, x) = edge_type; + } - if (i < rows && j < cols) - mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]); + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) + { + const dim3 block(16, 16); + const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); + + bindTexture(&tex_mag, mag); + + calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// + +namespace canny +{ + __device__ int counter = 0; + + __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st) + { + __shared__ volatile int smem[18][18]; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0; + if (threadIdx.y == 0) + smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0; + if (threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0; + if (threadIdx.x == 0) + smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1) + smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == 0) + smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) + smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? 
map(y + 1, x + 1) : 0; + + __syncthreads(); + + if (x >= map.cols || y >= map.rows) + return; + + int n; + + #pragma unroll + for (int k = 0; k < 16; ++k) + { + n = 0; + + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } + + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; } - void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; + + map(y, x) = e; + + n = 0; + + if (e == 2) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; - if (L2Grad) - calcMagnitude<<>>(dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx, dy, mag, rows, cols); + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; } - ////////////////////////////////////////////////////////////////////////////////////////// - - #define CANNY_SHIFT 15 - #define TG22 (int)(0.4142135623730950488016887242097*(1< 0) { - __shared__ float smem[18][18]; + const int ind = ::atomicAdd(&counter, 1); + st[ind] = make_ushort2(x, y); + } + } - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); - if (ly < 14) - smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + const dim3 block(16, 16); + const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + edgesHysteresisLocalKernel<<>>(map, st1); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// + +namespace canny +{ + __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + + __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count) + { + const int stack_size = 512; + + __shared__ int s_counter; + __shared__ int s_ind; + __shared__ ushort2 s_st[stack_size]; + + if (threadIdx.x == 0) + s_counter = 0; + + __syncthreads(); + + int ind = blockIdx.y * gridDim.x + blockIdx.x; + + if (ind >= count) + return; + + ushort2 pos = st1[ind]; + + if (threadIdx.x < 8) + { + pos.x += c_dx[threadIdx.x]; + pos.y += c_dy[threadIdx.x]; + + if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y 
< map.rows && map(pos.y, pos.x) == 1) + { + map(pos.y, pos.x) = 2; + + ind = Emulation::smem::atomicAdd(&s_counter, 1); + + s_st[ind] = pos; + } + } + + __syncthreads(); + + while (s_counter > 0 && s_counter <= stack_size - blockDim.x) + { + const int subTaskIdx = threadIdx.x >> 3; + const int portion = ::min(s_counter, blockDim.x >> 3); + + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; __syncthreads(); - if (i < rows && j < cols) - { - int x = dx.ptr(i)[j]; - int y = dy.ptr(i)[j]; - const int s = (x ^ y) < 0 ? -1 : 1; - const float m = smem[threadIdx.y + 1][threadIdx.x + 1]; - - x = ::abs(x); - y = ::abs(y); - - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; - - if (m > low_thresh) - { - const int tg22x = x * TG22; - const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); - - y <<= CANNY_SHIFT; - - if (y < tg22x) - { - if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else if( y > tg67x ) - { - if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } - } - - map.ptr(i + 1)[j + 1] = edge_type; - } - } - - #undef CANNY_SHIFT - #undef TG22 - - void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - - ////////////////////////////////////////////////////////////////////////////////////////// - - __device__ unsigned int counter = 0; - - __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) - { - #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120) - - __shared__ int smem[18][18]; - - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; - - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; - - if (ly < 14) - smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; - - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; - - __syncthreads(); - - if (i < rows && j < cols) - { - int n; - - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; - - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; - } - - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; - } - - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - - map.ptr(i + 1)[j + 1] = e; - - n = 0; - - if (e == 2) - { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y 
][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; - } - - if (n > 0) - { - const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); - st[ind] = make_ushort2(j + 1, i + 1); - } - } - - #endif - } - - void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) - { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - edgesHysteresisLocal<<>>(map, st1, rows, cols); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - - __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; - __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - - __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120 - - const int stack_size = 512; - - __shared__ unsigned int s_counter; - __shared__ unsigned int s_ind; - __shared__ ushort2 s_st[stack_size]; - if (threadIdx.x == 0) - s_counter = 0; + s_counter -= portion; + __syncthreads(); - int ind = blockIdx.y * gridDim.x + blockIdx.x; - - if (ind < count) + if (subTaskIdx < portion) { - ushort2 pos = st1[ind]; + pos.x += c_dx[threadIdx.x & 7]; + pos.y += c_dy[threadIdx.x & 7]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1) { - if (threadIdx.x < 8) - { - pos.x += c_dx[threadIdx.x]; - pos.y += c_dy[threadIdx.x]; + map(pos.y, pos.x) = 2; - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; + ind = Emulation::smem::atomicAdd(&s_counter, 1); - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - - while (s_counter > 0 && s_counter <= stack_size - blockDim.x) - { - const int subTaskIdx = threadIdx.x >> 3; - const int portion = ::min(s_counter, blockDim.x >> 3); - - pos.x = pos.y = 0; - - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - __syncthreads(); - - if (threadIdx.x == 0) - s_counter -= portion; - __syncthreads(); - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) - { - pos.x += c_dx[threadIdx.x & 7]; - pos.y += c_dy[threadIdx.x & 7]; - - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - } - - if (s_counter > 0) - { - if (threadIdx.x == 0) - { - ind = atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; - } - __syncthreads(); - - ind = s_ind; - - for (int i = threadIdx.x; i < s_counter; i += blockDim.x) - { - st2[ind + i] = s_st[i]; - } - } + s_st[ind] = pos; } } - #endif + __syncthreads(); } - void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) + if (s_counter > 0) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - - while (count > 0) + if (threadIdx.x == 0) { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - - dim3 block(128, 1, 1); - 
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
- edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
- cudaSafeCall( cudaGetLastError() );
- - cudaSafeCall( cudaDeviceSynchronize() );
- - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
- - std::swap(st1, st2);
+ ind = ::atomicAdd(&counter, s_counter);
+ s_ind = ind - s_counter; }
+ + __syncthreads();
+ + ind = s_ind;
+ + for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+ st2[ind + i] = s_st[i]; } + }
- __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
+ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
+ { + void* counter_ptr;
+ cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
+ + int count;
+ cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+ + while (count > 0) {
- const int j = blockIdx.x * 16 + threadIdx.x;
- const int i = blockIdx.y * 16 + threadIdx.y;
+ cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
- if (i < rows && j < cols)
- dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); - }
+ const dim3 block(128);
+ const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
- void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
- { - dim3 block(16, 16, 1);
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
- - getEdges<<<grid, block>>>(map, dst, rows, cols);
+ edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
 cudaSafeCall( cudaGetLastError() );
 cudaSafeCall( cudaDeviceSynchronize() );
+ + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+ + std::swap(st1, st2); }
- } // namespace canny
-}}} // namespace cv { namespace gpu { namespace device
+ } +}
+//////////////////////////////////////////////////////////////////////////////////////////
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+namespace canny
+{
+ struct GetEdges : unary_function<int, uchar>
+ {
+ __device__ __forceinline__ uchar operator ()(int e) const
+ {
+ return (uchar)(-(e >> 1));
+ }
+
+ __device__ __forceinline__ GetEdges() {}
+ __device__ __forceinline__ GetEdges(const GetEdges&) {}
+ };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
+ {
+ enum { smart_shift = 4 };
+ };
+}}}
+
+namespace canny
+{
+ void getEdges(PtrStepSzi map, PtrStepSzb dst)
+ {
+ transform(map, dst, GetEdges(), WithOutMask(), 0);
+ }
+}
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/ccomponetns.cu b/modules/gpu/src/cuda/ccomponetns.cu index 62e81376aa..c094e08c0e 100644
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@@ -497,6 +497,7 @@ namespace cv { namespace gpu { namespace device
 void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
 {
+ (void) flags;
 dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
 dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
@@ -529,4 +530,4 @@ namespace cv { namespace gpu { namespace device
 } } } }
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.0.cu b/modules/gpu/src/cuda/column_filter.0.cu new file mode 100644 index 0000000000..c35c6ee64d
--- /dev/null
+++ b/modules/gpu/src/cuda/column_filter.0.cu
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING
OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.1.cu b/modules/gpu/src/cuda/column_filter.1.cu new file mode 100644 index 0000000000..9a2d6a0427 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.1.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.10.cu b/modules/gpu/src/cuda/column_filter.10.cu new file mode 100644 index 0000000000..41e35bc1c6 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.10.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.11.cu b/modules/gpu/src/cuda/column_filter.11.cu new file mode 100644 index 0000000000..981208a68b --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.11.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.12.cu b/modules/gpu/src/cuda/column_filter.12.cu new file mode 100644 index 0000000000..13d2e60023 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.12.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.13.cu b/modules/gpu/src/cuda/column_filter.13.cu new file mode 100644 index 0000000000..09f6484af4 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.13.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.14.cu b/modules/gpu/src/cuda/column_filter.14.cu new file mode 100644 index 0000000000..901ab03011 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.14.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.2.cu b/modules/gpu/src/cuda/column_filter.2.cu new file mode 100644 index 0000000000..05ee01c763 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.2.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.3.cu b/modules/gpu/src/cuda/column_filter.3.cu new file mode 100644 index 0000000000..1bf49219f9 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.3.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.4.cu b/modules/gpu/src/cuda/column_filter.4.cu new file mode 100644 index 0000000000..bec7a085a0 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.4.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.5.cu b/modules/gpu/src/cuda/column_filter.5.cu new file mode 100644 index 0000000000..8194ee39aa --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.5.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.6.cu b/modules/gpu/src/cuda/column_filter.6.cu new file mode 100644 index 0000000000..d8fc49be68 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.6.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.7.cu b/modules/gpu/src/cuda/column_filter.7.cu new file mode 100644 index 0000000000..534bd821ef --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.7.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.8.cu b/modules/gpu/src/cuda/column_filter.8.cu new file mode 100644 index 0000000000..38e70e772e --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.8.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.9.cu b/modules/gpu/src/cuda/column_filter.9.cu new file mode 100644 index 0000000000..5b58345820 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.9.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu deleted file mode 100644 index af7369ad5e..0000000000 --- a/modules/gpu/src/cuda/column_filter.cu +++ /dev/null @@ -1,391 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. 
-// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if !defined CUDA_DISABLER - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" -#include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/border_interpolate.hpp" -#include "opencv2/gpu/device/static_check.hpp" - -namespace cv { namespace gpu { namespace device -{ - namespace column_filter - { - #define MAX_KERNEL_SIZE 32 - - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - - void loadKernel(const float* kernel, int ksize, cudaStream_t stream) - { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - } - - template - __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 16; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = KSIZE <= 16 ? 
1 : 2; - #else - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 2; - const int HALO_SIZE = 2; - #endif - - typedef typename TypeVec::cn>::vec_type sum_t; - - __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; - - const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - - if (x >= src.cols) - return; - - const T* src_col = src.ptr() + x; - - const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; - - if (blockIdx.y > 0) - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); - } - else - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); - } - - if (blockIdx.y + 2 < gridDim.y) - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); - } - else - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); - } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - { - const int y = yStart + j * BLOCK_DIM_Y; - - if (y < src.rows) - { - sum_t sum = VecTraits::all(0); - - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; - - dst(y, x) = saturate_cast(sum); - } - } - } - - template class B> - void linearColumnFilter_caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) - { - int BLOCK_DIM_X; - int BLOCK_DIM_Y; - int PATCH_PER_BLOCK; - - if (cc >= 20) - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 16; - PATCH_PER_BLOCK = 4; - } - else - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 8; - PATCH_PER_BLOCK = 2; - } - - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); - - B brd(src.rows); - - linearColumnFilter<<>>(src, dst, anchor, brd); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) - { - typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); - - static const caller_t callers[5][33] = - { - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect101>, - linearColumnFilter_caller< 2, T, D, BrdColReflect101>, - linearColumnFilter_caller< 3, T, D, BrdColReflect101>, - linearColumnFilter_caller< 4, T, D, 
BrdColReflect101>, - linearColumnFilter_caller< 5, T, D, BrdColReflect101>, - linearColumnFilter_caller< 6, T, D, BrdColReflect101>, - linearColumnFilter_caller< 7, T, D, BrdColReflect101>, - linearColumnFilter_caller< 8, T, D, BrdColReflect101>, - linearColumnFilter_caller< 9, T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101>, - linearColumnFilter_caller<17, T, D, BrdColReflect101>, - linearColumnFilter_caller<18, T, D, BrdColReflect101>, - linearColumnFilter_caller<19, T, D, BrdColReflect101>, - linearColumnFilter_caller<20, T, D, BrdColReflect101>, - linearColumnFilter_caller<21, T, D, BrdColReflect101>, - linearColumnFilter_caller<22, T, D, BrdColReflect101>, - linearColumnFilter_caller<23, T, D, BrdColReflect101>, - linearColumnFilter_caller<24, T, D, BrdColReflect101>, - linearColumnFilter_caller<25, T, D, BrdColReflect101>, - linearColumnFilter_caller<26, T, D, BrdColReflect101>, - linearColumnFilter_caller<27, T, D, BrdColReflect101>, - linearColumnFilter_caller<28, T, D, BrdColReflect101>, - linearColumnFilter_caller<29, T, D, BrdColReflect101>, - linearColumnFilter_caller<30, T, D, BrdColReflect101>, - linearColumnFilter_caller<31, T, D, BrdColReflect101>, - linearColumnFilter_caller<32, T, D, BrdColReflect101> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReplicate>, - linearColumnFilter_caller< 2, T, D, BrdColReplicate>, - linearColumnFilter_caller< 3, T, D, BrdColReplicate>, - linearColumnFilter_caller< 4, T, D, BrdColReplicate>, - linearColumnFilter_caller< 5, T, D, BrdColReplicate>, - linearColumnFilter_caller< 6, T, D, BrdColReplicate>, - linearColumnFilter_caller< 7, T, D, BrdColReplicate>, - linearColumnFilter_caller< 8, T, D, BrdColReplicate>, - linearColumnFilter_caller< 9, T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate>, - linearColumnFilter_caller<17, T, D, BrdColReplicate>, - linearColumnFilter_caller<18, T, D, BrdColReplicate>, - linearColumnFilter_caller<19, T, D, BrdColReplicate>, - linearColumnFilter_caller<20, T, D, BrdColReplicate>, - linearColumnFilter_caller<21, T, D, BrdColReplicate>, - linearColumnFilter_caller<22, T, D, BrdColReplicate>, - linearColumnFilter_caller<23, T, D, BrdColReplicate>, - linearColumnFilter_caller<24, T, D, BrdColReplicate>, - linearColumnFilter_caller<25, T, D, BrdColReplicate>, - linearColumnFilter_caller<26, T, D, BrdColReplicate>, - linearColumnFilter_caller<27, T, D, BrdColReplicate>, - linearColumnFilter_caller<28, T, D, BrdColReplicate>, - linearColumnFilter_caller<29, T, D, BrdColReplicate>, - linearColumnFilter_caller<30, T, D, BrdColReplicate>, - linearColumnFilter_caller<31, T, D, BrdColReplicate>, - linearColumnFilter_caller<32, T, D, BrdColReplicate> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColConstant>, - linearColumnFilter_caller< 2, T, D, BrdColConstant>, - 
linearColumnFilter_caller< 3, T, D, BrdColConstant>, - linearColumnFilter_caller< 4, T, D, BrdColConstant>, - linearColumnFilter_caller< 5, T, D, BrdColConstant>, - linearColumnFilter_caller< 6, T, D, BrdColConstant>, - linearColumnFilter_caller< 7, T, D, BrdColConstant>, - linearColumnFilter_caller< 8, T, D, BrdColConstant>, - linearColumnFilter_caller< 9, T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - linearColumnFilter_caller<16, T, D, BrdColConstant>, - linearColumnFilter_caller<17, T, D, BrdColConstant>, - linearColumnFilter_caller<18, T, D, BrdColConstant>, - linearColumnFilter_caller<19, T, D, BrdColConstant>, - linearColumnFilter_caller<20, T, D, BrdColConstant>, - linearColumnFilter_caller<21, T, D, BrdColConstant>, - linearColumnFilter_caller<22, T, D, BrdColConstant>, - linearColumnFilter_caller<23, T, D, BrdColConstant>, - linearColumnFilter_caller<24, T, D, BrdColConstant>, - linearColumnFilter_caller<25, T, D, BrdColConstant>, - linearColumnFilter_caller<26, T, D, BrdColConstant>, - linearColumnFilter_caller<27, T, D, BrdColConstant>, - linearColumnFilter_caller<28, T, D, BrdColConstant>, - linearColumnFilter_caller<29, T, D, BrdColConstant>, - linearColumnFilter_caller<30, T, D, BrdColConstant>, - linearColumnFilter_caller<31, T, D, BrdColConstant>, - linearColumnFilter_caller<32, T, D, BrdColConstant> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect>, - linearColumnFilter_caller< 2, T, D, BrdColReflect>, - linearColumnFilter_caller< 3, T, D, BrdColReflect>, - linearColumnFilter_caller< 4, T, D, BrdColReflect>, - linearColumnFilter_caller< 5, T, D, BrdColReflect>, - linearColumnFilter_caller< 6, T, D, BrdColReflect>, - linearColumnFilter_caller< 7, T, D, BrdColReflect>, - linearColumnFilter_caller< 8, T, D, BrdColReflect>, - linearColumnFilter_caller< 9, T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect>, - linearColumnFilter_caller<17, T, D, BrdColReflect>, - linearColumnFilter_caller<18, T, D, BrdColReflect>, - linearColumnFilter_caller<19, T, D, BrdColReflect>, - linearColumnFilter_caller<20, T, D, BrdColReflect>, - linearColumnFilter_caller<21, T, D, BrdColReflect>, - linearColumnFilter_caller<22, T, D, BrdColReflect>, - linearColumnFilter_caller<23, T, D, BrdColReflect>, - linearColumnFilter_caller<24, T, D, BrdColReflect>, - linearColumnFilter_caller<25, T, D, BrdColReflect>, - linearColumnFilter_caller<26, T, D, BrdColReflect>, - linearColumnFilter_caller<27, T, D, BrdColReflect>, - linearColumnFilter_caller<28, T, D, BrdColReflect>, - linearColumnFilter_caller<29, T, D, BrdColReflect>, - linearColumnFilter_caller<30, T, D, BrdColReflect>, - linearColumnFilter_caller<31, T, D, BrdColReflect>, - linearColumnFilter_caller<32, T, D, BrdColReflect> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColWrap>, - linearColumnFilter_caller< 2, T, D, BrdColWrap>, - linearColumnFilter_caller< 3, T, D, BrdColWrap>, 
- linearColumnFilter_caller< 4, T, D, BrdColWrap>, - linearColumnFilter_caller< 5, T, D, BrdColWrap>, - linearColumnFilter_caller< 6, T, D, BrdColWrap>, - linearColumnFilter_caller< 7, T, D, BrdColWrap>, - linearColumnFilter_caller< 8, T, D, BrdColWrap>, - linearColumnFilter_caller< 9, T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, - linearColumnFilter_caller<16, T, D, BrdColWrap>, - linearColumnFilter_caller<17, T, D, BrdColWrap>, - linearColumnFilter_caller<18, T, D, BrdColWrap>, - linearColumnFilter_caller<19, T, D, BrdColWrap>, - linearColumnFilter_caller<20, T, D, BrdColWrap>, - linearColumnFilter_caller<21, T, D, BrdColWrap>, - linearColumnFilter_caller<22, T, D, BrdColWrap>, - linearColumnFilter_caller<23, T, D, BrdColWrap>, - linearColumnFilter_caller<24, T, D, BrdColWrap>, - linearColumnFilter_caller<25, T, D, BrdColWrap>, - linearColumnFilter_caller<26, T, D, BrdColWrap>, - linearColumnFilter_caller<27, T, D, BrdColWrap>, - linearColumnFilter_caller<28, T, D, BrdColWrap>, - linearColumnFilter_caller<29, T, D, BrdColWrap>, - linearColumnFilter_caller<30, T, D, BrdColWrap>, - linearColumnFilter_caller<31, T, D, BrdColWrap>, - linearColumnFilter_caller<32, T, D, BrdColWrap> - } - }; - - loadKernel(kernel, ksize, stream); - - callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); - } - - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } // namespace column_filter -}}} // namespace cv { namespace gpu { namespace device - - -#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.h b/modules/gpu/src/cuda/column_filter.h new file mode 100644 index 0000000000..52b9103393 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.h @@ -0,0 +1,373 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. 
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace column_filter +{ + #define MAX_KERNEL_SIZE 32 + + __constant__ float c_kernel[MAX_KERNEL_SIZE]; + + template + __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 16; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = KSIZE <= 16 ? 
1 : 2; + #else + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 2; + const int HALO_SIZE = 2; + #endif + + typedef typename TypeVec::cn>::vec_type sum_t; + + __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; + + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + + if (x >= src.cols) + return; + + const T* src_col = src.ptr() + x; + + const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; + + if (blockIdx.y > 0) + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); + } + else + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); + } + + if (blockIdx.y + 2 < gridDim.y) + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); + } + else + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int y = yStart + j * BLOCK_DIM_Y; + + if (y < src.rows) + { + sum_t sum = VecTraits::all(0); + + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; + + dst(y, x) = saturate_cast(sum); + } + } + } + + template class B> + void caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) + { + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 16; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 2; + } + + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); + + B brd(src.rows); + + linearColumnFilter<<>>(src, dst, anchor, brd); + + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +namespace filter +{ + template + void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + { + typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); + + static const caller_t callers[5][33] = + { + { + 0, + column_filter::caller< 1, T, D, BrdColReflect101>, + column_filter::caller< 2, T, D, BrdColReflect101>, + column_filter::caller< 3, T, D, BrdColReflect101>, + column_filter::caller< 4, T, D, BrdColReflect101>, + 
column_filter::caller< 5, T, D, BrdColReflect101>, + column_filter::caller< 6, T, D, BrdColReflect101>, + column_filter::caller< 7, T, D, BrdColReflect101>, + column_filter::caller< 8, T, D, BrdColReflect101>, + column_filter::caller< 9, T, D, BrdColReflect101>, + column_filter::caller<10, T, D, BrdColReflect101>, + column_filter::caller<11, T, D, BrdColReflect101>, + column_filter::caller<12, T, D, BrdColReflect101>, + column_filter::caller<13, T, D, BrdColReflect101>, + column_filter::caller<14, T, D, BrdColReflect101>, + column_filter::caller<15, T, D, BrdColReflect101>, + column_filter::caller<16, T, D, BrdColReflect101>, + column_filter::caller<17, T, D, BrdColReflect101>, + column_filter::caller<18, T, D, BrdColReflect101>, + column_filter::caller<19, T, D, BrdColReflect101>, + column_filter::caller<20, T, D, BrdColReflect101>, + column_filter::caller<21, T, D, BrdColReflect101>, + column_filter::caller<22, T, D, BrdColReflect101>, + column_filter::caller<23, T, D, BrdColReflect101>, + column_filter::caller<24, T, D, BrdColReflect101>, + column_filter::caller<25, T, D, BrdColReflect101>, + column_filter::caller<26, T, D, BrdColReflect101>, + column_filter::caller<27, T, D, BrdColReflect101>, + column_filter::caller<28, T, D, BrdColReflect101>, + column_filter::caller<29, T, D, BrdColReflect101>, + column_filter::caller<30, T, D, BrdColReflect101>, + column_filter::caller<31, T, D, BrdColReflect101>, + column_filter::caller<32, T, D, BrdColReflect101> + }, + { + 0, + column_filter::caller< 1, T, D, BrdColReplicate>, + column_filter::caller< 2, T, D, BrdColReplicate>, + column_filter::caller< 3, T, D, BrdColReplicate>, + column_filter::caller< 4, T, D, BrdColReplicate>, + column_filter::caller< 5, T, D, BrdColReplicate>, + column_filter::caller< 6, T, D, BrdColReplicate>, + column_filter::caller< 7, T, D, BrdColReplicate>, + column_filter::caller< 8, T, D, BrdColReplicate>, + column_filter::caller< 9, T, D, BrdColReplicate>, + column_filter::caller<10, T, D, BrdColReplicate>, + column_filter::caller<11, T, D, BrdColReplicate>, + column_filter::caller<12, T, D, BrdColReplicate>, + column_filter::caller<13, T, D, BrdColReplicate>, + column_filter::caller<14, T, D, BrdColReplicate>, + column_filter::caller<15, T, D, BrdColReplicate>, + column_filter::caller<16, T, D, BrdColReplicate>, + column_filter::caller<17, T, D, BrdColReplicate>, + column_filter::caller<18, T, D, BrdColReplicate>, + column_filter::caller<19, T, D, BrdColReplicate>, + column_filter::caller<20, T, D, BrdColReplicate>, + column_filter::caller<21, T, D, BrdColReplicate>, + column_filter::caller<22, T, D, BrdColReplicate>, + column_filter::caller<23, T, D, BrdColReplicate>, + column_filter::caller<24, T, D, BrdColReplicate>, + column_filter::caller<25, T, D, BrdColReplicate>, + column_filter::caller<26, T, D, BrdColReplicate>, + column_filter::caller<27, T, D, BrdColReplicate>, + column_filter::caller<28, T, D, BrdColReplicate>, + column_filter::caller<29, T, D, BrdColReplicate>, + column_filter::caller<30, T, D, BrdColReplicate>, + column_filter::caller<31, T, D, BrdColReplicate>, + column_filter::caller<32, T, D, BrdColReplicate> + }, + { + 0, + column_filter::caller< 1, T, D, BrdColConstant>, + column_filter::caller< 2, T, D, BrdColConstant>, + column_filter::caller< 3, T, D, BrdColConstant>, + column_filter::caller< 4, T, D, BrdColConstant>, + column_filter::caller< 5, T, D, BrdColConstant>, + column_filter::caller< 6, T, D, BrdColConstant>, + column_filter::caller< 7, T, D, BrdColConstant>, + column_filter::caller< 
8, T, D, BrdColConstant>, + column_filter::caller< 9, T, D, BrdColConstant>, + column_filter::caller<10, T, D, BrdColConstant>, + column_filter::caller<11, T, D, BrdColConstant>, + column_filter::caller<12, T, D, BrdColConstant>, + column_filter::caller<13, T, D, BrdColConstant>, + column_filter::caller<14, T, D, BrdColConstant>, + column_filter::caller<15, T, D, BrdColConstant>, + column_filter::caller<16, T, D, BrdColConstant>, + column_filter::caller<17, T, D, BrdColConstant>, + column_filter::caller<18, T, D, BrdColConstant>, + column_filter::caller<19, T, D, BrdColConstant>, + column_filter::caller<20, T, D, BrdColConstant>, + column_filter::caller<21, T, D, BrdColConstant>, + column_filter::caller<22, T, D, BrdColConstant>, + column_filter::caller<23, T, D, BrdColConstant>, + column_filter::caller<24, T, D, BrdColConstant>, + column_filter::caller<25, T, D, BrdColConstant>, + column_filter::caller<26, T, D, BrdColConstant>, + column_filter::caller<27, T, D, BrdColConstant>, + column_filter::caller<28, T, D, BrdColConstant>, + column_filter::caller<29, T, D, BrdColConstant>, + column_filter::caller<30, T, D, BrdColConstant>, + column_filter::caller<31, T, D, BrdColConstant>, + column_filter::caller<32, T, D, BrdColConstant> + }, + { + 0, + column_filter::caller< 1, T, D, BrdColReflect>, + column_filter::caller< 2, T, D, BrdColReflect>, + column_filter::caller< 3, T, D, BrdColReflect>, + column_filter::caller< 4, T, D, BrdColReflect>, + column_filter::caller< 5, T, D, BrdColReflect>, + column_filter::caller< 6, T, D, BrdColReflect>, + column_filter::caller< 7, T, D, BrdColReflect>, + column_filter::caller< 8, T, D, BrdColReflect>, + column_filter::caller< 9, T, D, BrdColReflect>, + column_filter::caller<10, T, D, BrdColReflect>, + column_filter::caller<11, T, D, BrdColReflect>, + column_filter::caller<12, T, D, BrdColReflect>, + column_filter::caller<13, T, D, BrdColReflect>, + column_filter::caller<14, T, D, BrdColReflect>, + column_filter::caller<15, T, D, BrdColReflect>, + column_filter::caller<16, T, D, BrdColReflect>, + column_filter::caller<17, T, D, BrdColReflect>, + column_filter::caller<18, T, D, BrdColReflect>, + column_filter::caller<19, T, D, BrdColReflect>, + column_filter::caller<20, T, D, BrdColReflect>, + column_filter::caller<21, T, D, BrdColReflect>, + column_filter::caller<22, T, D, BrdColReflect>, + column_filter::caller<23, T, D, BrdColReflect>, + column_filter::caller<24, T, D, BrdColReflect>, + column_filter::caller<25, T, D, BrdColReflect>, + column_filter::caller<26, T, D, BrdColReflect>, + column_filter::caller<27, T, D, BrdColReflect>, + column_filter::caller<28, T, D, BrdColReflect>, + column_filter::caller<29, T, D, BrdColReflect>, + column_filter::caller<30, T, D, BrdColReflect>, + column_filter::caller<31, T, D, BrdColReflect>, + column_filter::caller<32, T, D, BrdColReflect> + }, + { + 0, + column_filter::caller< 1, T, D, BrdColWrap>, + column_filter::caller< 2, T, D, BrdColWrap>, + column_filter::caller< 3, T, D, BrdColWrap>, + column_filter::caller< 4, T, D, BrdColWrap>, + column_filter::caller< 5, T, D, BrdColWrap>, + column_filter::caller< 6, T, D, BrdColWrap>, + column_filter::caller< 7, T, D, BrdColWrap>, + column_filter::caller< 8, T, D, BrdColWrap>, + column_filter::caller< 9, T, D, BrdColWrap>, + column_filter::caller<10, T, D, BrdColWrap>, + column_filter::caller<11, T, D, BrdColWrap>, + column_filter::caller<12, T, D, BrdColWrap>, + column_filter::caller<13, T, D, BrdColWrap>, + column_filter::caller<14, T, D, BrdColWrap>, + 
column_filter::caller<15, T, D, BrdColWrap>, + column_filter::caller<16, T, D, BrdColWrap>, + column_filter::caller<17, T, D, BrdColWrap>, + column_filter::caller<18, T, D, BrdColWrap>, + column_filter::caller<19, T, D, BrdColWrap>, + column_filter::caller<20, T, D, BrdColWrap>, + column_filter::caller<21, T, D, BrdColWrap>, + column_filter::caller<22, T, D, BrdColWrap>, + column_filter::caller<23, T, D, BrdColWrap>, + column_filter::caller<24, T, D, BrdColWrap>, + column_filter::caller<25, T, D, BrdColWrap>, + column_filter::caller<26, T, D, BrdColWrap>, + column_filter::caller<27, T, D, BrdColWrap>, + column_filter::caller<28, T, D, BrdColWrap>, + column_filter::caller<29, T, D, BrdColWrap>, + column_filter::caller<30, T, D, BrdColWrap>, + column_filter::caller<31, T, D, BrdColWrap>, + column_filter::caller<32, T, D, BrdColWrap> + } + }; + + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); + + callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); + } +} diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index c61601d4f7..27fb61ff70 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -42,405 +42,875 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { namespace device -{ - ////////////////////////////////////////////////////////////////////////// - // add +using namespace cv::gpu; +using namespace cv::gpu::device; - template struct Add : binary_function +namespace arithm +{ + template struct ArithmFuncTraits + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 1 }; + }; + + template <> struct ArithmFuncTraits<1, 1> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<1, 2> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<1, 4> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template <> struct ArithmFuncTraits<2, 1> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<2, 2> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<2, 4> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; 
+ enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template <> struct ArithmFuncTraits<4, 1> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<4, 2> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<4, 4> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +} + +////////////////////////////////////////////////////////////////////////// +// addMat + +namespace arithm +{ + template struct VAdd4; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + 
asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + + //////////////////////////////////// + + template struct VAdd2; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + + //////////////////////////////////// + + template struct AddMat : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a + b); } + + __device__ __forceinline__ AddMat() {} + __device__ __forceinline__ AddMat(const AddMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits + { }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + 
//////////////////////////////////// + + template struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::AddMat > : arithm::ArithmFuncTraits { - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), WithOutMask(), stream); + }; +}}} + +namespace arithm +{ + template + void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd4(), WithOutMask(), stream); } - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, 
const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& 
mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct AddScalar : unary_function + template + void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - AddScalar(double val_) : val(val_) {} + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd2(), WithOutMask(), stream); + } + + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), WithOutMask(), stream); + } + + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, 
cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, 
cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// addScalar + +namespace arithm +{ + template struct AddScalar : unary_function + { + S val; + + explicit AddScalar(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { return saturate_cast(a + val); } - const double val; }; +} - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::AddScalar > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) +namespace arithm +{ + template + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AddScalar op(val); + AddScalar op(static_cast(val)); + if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t 
stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t 
stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template 
void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void 
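// The blocks of `template void addScalar(...)` lines above and below are explicit
// template instantiations: the function body lives only in this .cu file, so every
// type combination the GPU module exposes has to be listed here for the compiler to
// emit device code for it, and the commented-out lines are combinations that are not
// exported. A minimal, host-compilable sketch of the same pattern, with hypothetical
// names (not part of the module):

#include <iostream>

template <typename T, typename D>
void scaleBuffer(const T* src, D* dst, int n, double scale)
{
    for (int i = 0; i < n; ++i)
        dst[i] = static_cast<D>(src[i] * scale);   // body visible only in this file
}

// Explicit instantiations: other translation units see only a declaration, so every
// combination that must link gets a line here.
template void scaleBuffer<unsigned char, float>(const unsigned char*, float*, int, double);
template void scaleBuffer<short, float>(const short*, float*, int, double);

int main()
{
    unsigned char src[3] = {1, 2, 3};
    float dst[3];
    scaleBuffer(src, dst, 3, 0.5);
    std::cout << dst[0] << ' ' << dst[1] << ' ' << dst[2] << '\n';  // 0.5 1 1.5
    return 0;
}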
addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // subtract +////////////////////////////////////////////////////////////////////////// +// subMat - template struct Subtract : binary_function +namespace arithm +{ + template struct VSub4; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + 
__device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + + //////////////////////////////////// + + template struct VSub2; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + + //////////////////////////////////// + + template struct SubMat : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a - b); } + + __device__ __forceinline__ SubMat() {} + __device__ __forceinline__ SubMat(const SubMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits + { }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : 
DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::SubMat > : arithm::ArithmFuncTraits { - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), WithOutMask(), stream); + }; +}}} + +namespace arithm +{ + template + void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub4(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, 
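// The VSub4 specializations above wrap the sm_30 SIMD instruction vsub4 (and, on
// sm_20, four per-byte vsub operations with .b0..b3 selectors), so four packed bytes
// in one 32-bit word are subtracted lane-by-lane with saturation; as written, targets
// below sm_20 skip the guarded asm and the functor returns 0. A portable,
// host-compilable sketch of what the unsigned variant (vsub4.u32.u32.u32.sat)
// computes - illustrative names only, not part of the module:

#include <cstdint>
#include <cstdio>

static uint32_t vsub4_u32_sat(uint32_t a, uint32_t b)
{
    uint32_t res = 0;
    for (int lane = 0; lane < 4; ++lane)
    {
        int av = (a >> (8 * lane)) & 0xFF;
        int bv = (b >> (8 * lane)) & 0xFF;
        int d  = av - bv;
        if (d < 0) d = 0;                              // unsigned saturation per byte
        res |= static_cast<uint32_t>(d) << (8 * lane);
    }
    return res;
}

int main()
{
    // bytes {10, 200, 5, 255} minus {20, 100, 5, 1} -> {0, 100, 0, 254}
    uint32_t a = 0xFF05C80Au, b = 0x01056414u;
    printf("%08X\n", (unsigned) vsub4_u32_sat(a, b)); // FE006400
    return 0;
}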
const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void 
subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct SubtractScalar : unary_function + template + void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - SubtractScalar(double val_) : val(val_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return saturate_cast(a - val); - } - const double val; - }; - - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - SubtractScalar op(val); - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub2(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void 
subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), WithOutMask(), stream); + } - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, 
PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, 
PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - ////////////////////////////////////////////////////////////////////////// - // multiply + 
//template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - struct multiply_8uc4_32f : binary_function + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subScalar + +namespace arithm +{ + template + void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + AddScalar op(-static_cast(val)); + + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t 
stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, 
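// subScalar introduces no functor of its own: it reuses AddScalar with the scalar
// negated on the host, since adding -val and then saturating gives the same result as
// subtracting val while the scalar stays in a floating-point type. A small host-side
// sketch of the idea - clampToU8 stands in for saturate_cast<uchar>, and the names
// are illustrative:

#include <cstdio>

static unsigned char clampToU8(float v)
{
    if (v < 0.f)   return 0;
    if (v > 255.f) return 255;
    return static_cast<unsigned char>(v + 0.5f);       // round-to-nearest, clamped
}

struct AddScalarF            // same shape as the AddScalar functor: stores the scalar once
{
    float val;
    explicit AddScalarF(float v) : val(v) {}
    unsigned char operator()(unsigned char a) const { return clampToU8(a + val); }
};

int main()
{
    AddScalarF subTen(-10.f);                          // "subtract 10" written as "add -10"
    printf("%d %d\n", subTen(5), subTen(200));         // 0 190 - the low end saturates at 0
    return 0;
}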
double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulMat + +namespace arithm +{ + struct Mul_8uc4_32f : binary_function { __device__ __forceinline__ uint operator ()(uint a, float b) const { @@ -453,301 +923,262 @@ namespace cv { namespace gpu { namespace device return res; } + + __device__ __forceinline__ Mul_8uc4_32f() {} + __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f) - { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; - - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_8uc4_32f(), WithOutMask(), stream); - } - - struct multiply_16sc4_32f : binary_function + struct Mul_16sc4_32f : binary_function { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), saturate_cast(a.z * b), saturate_cast(a.w * b)); } + + __device__ __forceinline__ Mul_16sc4_32f() {} + __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f) + template struct Mul : binary_function { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a * b); + } + + __device__ __forceinline__ Mul() {} + __device__ __forceinline__ Mul(const Mul& other) {} }; - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template struct MulScale : binary_function { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_16sc4_32f(), WithOutMask(), stream); - } + S scale; + + explicit MulScale(S scale_) : scale(scale_) {} - template struct Multiply : binary_function - { - Multiply(float scale_) : scale(scale_) {} __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(scale * a * b); } - const float scale; }; - template struct Multiply : binary_function +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : arithm::ArithmFuncTraits { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ double operator ()(T a, T b) const - { - return scale * a * b; - } - const double scale; - }; - template <> struct Multiply : binary_function - { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ int operator ()(int a, int b) const - { - return saturate_cast(scale * a * b); - } - const double scale; }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + template struct TransformFunctorTraits< arithm::Mul > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct 
TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template struct MultiplyCaller + template struct TransformFunctorTraits< arithm::MulScale > : arithm::ArithmFuncTraits { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - Multiply op(static_cast(scale)); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } - }; - template struct MultiplyCaller - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } - }; - template <> struct MultiplyCaller - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } }; +}}} - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - MultiplyCaller::call(src1, src2, dst, scale, stream); + transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream); } - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const 
PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct MultiplyScalar : unary_function + void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {} + transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream); + } + + template + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) + { + Mul op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + else + { + MulScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + } + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t 
stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template 
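// mulMat picks its functor on the host: the common scale == 1 case uses the plain Mul
// functor (one multiply, then saturate), while any other scale goes through MulScale,
// which folds the factor into the product before saturating. A host-side sketch of the
// same dispatch - clampToU8 stands in for saturate_cast<uchar>, the loop for the
// transform() call, and the names are illustrative:

#include <cstdio>

static unsigned char clampToU8(float v)
{
    return v < 0.f ? 0 : v > 255.f ? 255 : static_cast<unsigned char>(v + 0.5f);
}

static void mulBuffer(const unsigned char* a, const unsigned char* b,
                      unsigned char* dst, int n, double scale)
{
    if (scale == 1.0)
    {
        for (int i = 0; i < n; ++i)                     // Mul: saturate(a * b)
            dst[i] = clampToU8(float(a[i]) * float(b[i]));
    }
    else
    {
        float s = static_cast<float>(scale);            // MulScale: saturate(scale * a * b)
        for (int i = 0; i < n; ++i)
            dst[i] = clampToU8(s * float(a[i]) * float(b[i]));
    }
}

int main()
{
    unsigned char a[2] = {20, 200}, b[2] = {10, 2}, dst[2];
    mulBuffer(a, b, dst, 2, 0.5);
    printf("%d %d\n", dst[0], dst[1]);                  // 100 200
    return 0;
}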
void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulScalar + +namespace arithm +{ + template struct MulScalar : unary_function + { + S val; + + explicit MulScalar(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { - return saturate_cast(scale * a * val); + return saturate_cast(a * val); } - const double val; - const double scale; }; +} - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::MulScalar > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - MultiplyScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + MulScalar op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void 
mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, 
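The replacement mulScalar path is considerably simpler than the removed multiply_gpu: the scalar is converted to the working type on the host with static_cast, so the cudaSetDoubleForDevice round trip disappears, and the per-element work is one multiply followed by saturate_cast. The sketch below reproduces the functor with assumed parameter names (T source, S working scalar, D destination; the angle brackets are stripped from this patch text) and applies it with Thrust instead of OpenCV's transform framework; static_cast stands in for saturate_cast<D>.

```cpp
#include <thrust/device_vector.h>
#include <thrust/transform.h>

// Assumed shape of arithm::MulScalar: the value is stored in the working
// type S, and every element is multiplied and narrowed to D.
template <typename T, typename S, typename D>
struct MulScalar
{
    S val;
    explicit MulScalar(S v) : val(v) {}
    __device__ D operator()(T a) const { return static_cast<D>(a * val); }
};

int main()
{
    thrust::device_vector<unsigned char> src(256, 3);
    thrust::device_vector<unsigned char> dst(256);

    // OpenCV would go through arithm::mulScalar(...) and its transform()
    // helper; thrust::transform shows the same element-wise effect.
    thrust::transform(src.begin(), src.end(), dst.begin(),
                      MulScalar<unsigned char, float, unsigned char>(2.5f));
    return 0;
}
```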
cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, 
double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // divide +////////////////////////////////////////////////////////////////////////// +// divMat - struct divide_8uc4_32f : binary_function +namespace arithm +{ + struct Div_8uc4_32f : binary_function { - __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const + __device__ __forceinline__ uint operator ()(uint a, float b) const { - return b != 0 ? 
make_uchar4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) - : make_uchar4(0,0,0,0); + uint res = 0; + + if (b != 0) + { + b = 1.0f / b; + res |= (saturate_cast((0xffu & (a )) * b) ); + res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); + res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); + res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + } + + return res; } }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f) - { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; - - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_8uc4_32f(), WithOutMask(), stream); - } - - - struct divide_16sc4_32f : binary_function + struct Div_16sc4_32f : binary_function { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { @@ -757,586 +1188,847 @@ namespace cv { namespace gpu { namespace device } }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f) + template struct Div : binary_function { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; - - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_16sc4_32f(), WithOutMask(), stream); - } - - template struct Divide : binary_function - { - Divide(double scale_) : scale(scale_) {} __device__ __forceinline__ D operator ()(T a, T b) const { - return b != 0 ? saturate_cast(a * scale / b) : 0; + return b != 0 ? saturate_cast(a / b) : 0; } - const double scale; + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template struct Div : binary_function + { + __device__ __forceinline__ float operator ()(T a, T b) const + { + return b != 0 ? static_cast(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template struct Div : binary_function + { + __device__ __forceinline__ double operator ()(T a, T b) const + { + return b != 0 ? static_cast(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + template struct DivScale : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + S scale; + + explicit DivScale(S scale_) : scale(scale_) {} + + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? 
saturate_cast(scale * a / b) : 0; + } }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + template struct TransformFunctorTraits< arithm::Div > : arithm::ArithmFuncTraits { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Divide op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + }; + + template struct TransformFunctorTraits< arithm::DivScale > : arithm::ArithmFuncTraits + { + }; +}}} + +namespace arithm +{ + void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) + { + transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void 
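Div_8uc4_32f now works on a packed 32-bit word instead of a uchar4: the divisor is inverted once (one floating-point division instead of four), each byte lane is unpacked, scaled and saturated, and the lanes are OR-ed back together, with a zero result when the divisor is zero. The sketch below is a self-contained version of that idea; clamp_u8 stands in for saturate_cast<uchar>, and the kernel shell exists only to make it runnable.

```cpp
#include <cuda_runtime.h>

// Round-to-nearest clamp to [0, 255]; a stand-in for saturate_cast<uchar>.
__device__ unsigned int clamp_u8(float v)
{
    v = fminf(fmaxf(v, 0.0f), 255.0f);
    return static_cast<unsigned int>(v + 0.5f);
}

// Divide the four uchar lanes of a packed word by a single float divisor.
__device__ unsigned int div_8uc4_32f(unsigned int a, float b)
{
    unsigned int res = 0;
    if (b != 0.0f)
    {
        b = 1.0f / b;   // one division, then four multiplies
        res |=  clamp_u8(( a        & 0xffu) * b);
        res |= (clamp_u8(((a >>  8) & 0xffu) * b) <<  8);
        res |= (clamp_u8(((a >> 16) & 0xffu) * b) << 16);
        res |= (clamp_u8(((a >> 24) & 0xffu) * b) << 24);
    }
    return res;
}

__global__ void div_8uc4_kernel(const unsigned int* src, const float* divisors,
                                unsigned int* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = div_8uc4_32f(src[i], divisors[i]);
}
```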
divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, 
const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct DivideScalar : unary_function + void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {} + transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream); + } + + template + void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) + { + Div op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + else + { + DivScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + } + + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + 
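divMat now dispatches on the host: when scale == 1 it instantiates the plain Div functor, so the common case skips the extra multiply per element, and only the scaled case uses DivScale, which carries the factor in the working type. The sketch below mirrors that structure with raw pointers and a generic transform kernel; the <T, D> and <T, S, D> parameter names are assumed (the angle brackets are missing from this patch text) and static_cast replaces saturate_cast.

```cpp
#include <cuda_runtime.h>

template <typename T, typename D>
struct Div          // a / b, 0 when b == 0
{
    __device__ D operator()(T a, T b) const
    { return b != 0 ? static_cast<D>(a / b) : D(0); }
};

template <typename T, typename S, typename D>
struct DivScale     // scale * a / b, 0 when b == 0
{
    S scale;
    explicit DivScale(S s) : scale(s) {}
    __device__ D operator()(T a, T b) const
    { return b != 0 ? static_cast<D>(scale * a / b) : D(0); }
};

template <typename T, typename D, typename Op>
__global__ void transform_kernel(const T* a, const T* b, D* c, int n, Op op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = op(a[i], b[i]);
}

// Host-side dispatch, mirroring divMat: pick the functor before launching.
template <typename T, typename S, typename D>
void divMat_sketch(const T* a, const T* b, D* c, int n, double scale, cudaStream_t stream)
{
    const int block = 256;
    const int grid  = (n + block - 1) / block;
    if (scale == 1)
        transform_kernel<<<grid, block, 0, stream>>>(a, b, c, n, Div<T, D>());
    else
        transform_kernel<<<grid, block, 0, stream>>>(a, b, c, n,
                                                     DivScale<T, S, D>(static_cast<S>(scale)));
}
```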
template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divScalar + +namespace arithm +{ + template + void 
divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) + { + MulScalar op(static_cast(1.0 / val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, 
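divScalar no longer has a functor of its own: dividing every element by a scalar is rewritten on the host as multiplication by the reciprocal and handed to MulScalar, so a single host-side division replaces a per-element divide. A hypothetical call (the angle-bracket type arguments are assumed, since they are stripped from this patch text) would expand as follows.

```cpp
// Assumed expansion of a divScalar call; MulScalar, transform() and
// WithOutMask() are the helpers named in the hunk above.
//
//   arithm::divScalar<uchar, float, uchar>(src, 4.0, dst, stream);
//   // becomes, inside divScalar:
//   //   MulScalar<uchar, float, uchar> op(static_cast<float>(1.0 / 4.0));   // 0.25f
//   //   transform((PtrStepSz<uchar>) src, (PtrStepSz<uchar>) dst, op, WithOutMask(), stream);
```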
cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divInv + +namespace arithm +{ + template struct DivInv : unary_function + { + S val; + + explicit DivInv(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { - return saturate_cast(scale * a / val); + return a != 0 ? saturate_cast(val / a) : 0; } - const double val; - const double scale; }; +} - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::DivInv > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - DivideScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + DivInv op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, 
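DivInv is the scalar-over-matrix counterpart (it replaces the old Reciprocal functor removed further down): each element divides the stored value, with a zero result for zero elements so the kernel never divides by zero. The sketch below uses assumed parameter names <T, S, D> and static_cast in place of saturate_cast; the functor is built on the host and passed to the kernel by value, much as the transform() helper would.

```cpp
#include <cuda_runtime.h>

template <typename T, typename S, typename D>
struct DivInv
{
    S val;
    explicit DivInv(S v) : val(v) {}
    __device__ D operator()(T a) const
    { return a != 0 ? static_cast<D>(val / a) : D(0); }
};

__global__ void div_inv_kernel(const float* src, float* dst, int n,
                               DivInv<float, float, float> op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = op(src[i]);
}
// Launch as, e.g.:
//   div_inv_kernel<<<grid, block>>>(src, dst, n, DivInv<float, float, float>(255.0f));
```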
cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t 
stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, 
cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} - template struct Reciprocal : unary_function +////////////////////////////////////////////////////////////////////////// +// absDiffMat + +namespace arithm +{ + template struct VAbsDiff4; + template <> struct VAbsDiff4 : binary_function { - Reciprocal(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - return a != 0 ? 
saturate_cast(scale / a) : 0; + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double scale; + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} + }; + template <> struct VAbsDiff4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + //////////////////////////////////// + + template struct VAbsDiff2; + template <> struct VAbsDiff2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {} }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + template <> struct VAbsDiff2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ 
VAbsDiff2(const VAbsDiff2& other) {} }; - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream) + //////////////////////////////////// + + __device__ __forceinline__ int _abs(int a) { - cudaSafeCall( cudaSetDoubleForDevice(&scalar) ); - Reciprocal op(scalar); - cv::gpu::device::transform((PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + return ::abs(a); + } + __device__ __forceinline__ float _abs(float a) + { + return ::fabsf(a); + } + __device__ __forceinline__ double _abs(double a) + { + return ::fabs(a); } - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const 
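VAbsDiff4 and VAbsDiff2 bring SIMD-in-a-word absolute differences: on sm_30 and newer a single vabsdiff4 (or vabsdiff2) PTX instruction handles four byte lanes (or two halfword lanes) per 32-bit register, and on sm_20 the same effect is built from per-lane vabsdiff byte or halfword selects. The sketch below wraps the unsigned byte variant in a standalone kernel; the PTX strings are taken from the hunk above, while the plain C++ fallback is added here only so the sketch covers targets below sm_20 (the patch itself leaves the result at zero in that case).

```cpp
#include <cuda_runtime.h>

__device__ unsigned int vabsdiff4_u8(unsigned int a, unsigned int b)
{
    unsigned int res = 0;
#if __CUDA_ARCH__ >= 300
    asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
    asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
    asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
    asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#else
    // Plain per-byte fallback, added in this sketch only (not in the patch).
    for (int s = 0; s < 32; s += 8)
    {
        int x = (a >> s) & 0xff, y = (b >> s) & 0xff;
        res |= static_cast<unsigned int>(x > y ? x - y : y - x) << s;
    }
#endif
    return res;
}

__global__ void absdiff4_kernel(const unsigned int* a, const unsigned int* b,
                                unsigned int* c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = vabsdiff4_u8(a[i], b[i]);
}
```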
PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // absdiff - - template struct Absdiff : binary_function + template struct AbsDiffMat : binary_function { - static __device__ __forceinline__ int abs(int a) - { - return ::abs(a); - } - static __device__ __forceinline__ float abs(float a) - { - return ::fabsf(a); - } - static __device__ __forceinline__ double abs(double a) - { - return ::fabs(a); - } - __device__ __forceinline__ T operator ()(T a, T b) const { - return saturate_cast(::abs(a - b)); + return saturate_cast(_abs(a - b)); } + + __device__ __forceinline__ AbsDiffMat() {} + __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits + { }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + //////////////////////////////////// + + template struct 
TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::AbsDiffMat > : arithm::ArithmFuncTraits { - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Absdiff(), WithOutMask(), stream); + }; +}}} + +namespace arithm +{ + template + void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff4(), WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template struct AbsdiffScalar : unary_function + template + void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - AbsdiffScalar(double val_) : val(val_) {} + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff2(), WithOutMask(), stream); + } + + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AbsDiffMat(), WithOutMask(), stream); + } + + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void 
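For element types that do not fit the packed paths, absDiffMat falls back to the generic AbsDiffMat functor, which now calls a small set of free __device__ overloads (_abs for int, float and double in the patch) so overload resolution picks the matching intrinsic for each element type. A minimal standalone version of that pattern, with static_cast standing in for saturate_cast<T>:

```cpp
#include <cuda_runtime.h>
#include <cstdlib>
#include <cmath>

// Free overloads mirroring the patch's _abs helpers (renamed here to avoid
// a reserved identifier at global scope).
__device__ __forceinline__ int    my_abs(int a)    { return ::abs(a); }
__device__ __forceinline__ float  my_abs(float a)  { return ::fabsf(a); }
__device__ __forceinline__ double my_abs(double a) { return ::fabs(a); }

template <typename T>
__global__ void absdiff_kernel(const T* a, const T* b, T* c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = static_cast<T>(my_abs(a[i] - b[i]));
}

template __global__ void absdiff_kernel<int>(const int*, const int*, int*, int);
template __global__ void absdiff_kernel<float>(const float*, const float*, float*, int);
```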
absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absDiffScalar + +namespace arithm +{ + template struct AbsDiffScalar : unary_function + { + S val; + + explicit AbsDiffScalar(S val_) : val(val_) {} + __device__ __forceinline__ T operator ()(T a) const { - return saturate_cast(::fabs(a - val)); + abs_func f; + return saturate_cast(f(a - val)); } - double val; }; +} - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::AbsDiffScalar > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AbsdiffScalar op(val); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + AbsDiffScalar op(static_cast(val)); + + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////////////////// - // Compare +////////////////////////////////////////////////////////////////////////// +// absMat - template
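absDiffScalar follows the same pattern as the other scalar variants: the value is stored in the working type S and the absolute difference goes through OpenCV's abs_func functor before saturating back to T. A self-contained sketch under assumptions: the parameter names <T, S> are guessed, a ternary replaces abs_func, and static_cast replaces saturate_cast<T> (safe in this example because |a - val| stays within uchar range).

```cpp
#include <cuda_runtime.h>

template <typename T, typename S>
struct AbsDiffScalar
{
    S val;
    explicit AbsDiffScalar(S v) : val(v) {}
    __device__ T operator()(T a) const
    {
        S d = a - val;
        return static_cast<T>(d < 0 ? -d : d);
    }
};

__global__ void absdiff_scalar_kernel(const unsigned char* src, unsigned char* dst,
                                      int n, AbsDiffScalar<unsigned char, float> op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = op(src[i]);
}
// Launch as, e.g.:
//   absdiff_scalar_kernel<<<grid, block>>>(src, dst, n,
//       AbsDiffScalar<unsigned char, float>(128.0f));
```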