diff --git a/CMakeLists.txt b/CMakeLists.txt index 34ea764f8a..9b7f532ac5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,14 +110,15 @@ endif() # Optional 3rd party components # =================================================== -OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS AND NOT CARMA) ) OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) -OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) ) OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON) -OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS)) OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_GTK "Include GTK support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_IPP "Include Intel IPP support" OFF IF (MSVC OR X86 OR X86_64) ) @@ -139,9 +140,9 @@ OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) ) OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF) -OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) ) -OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) ) -OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) +OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) +OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) # OpenCV build components @@ -160,12 +161,12 @@ OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF I OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID ) # 3rd party libs -OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE ) -OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE ) -OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE ) -OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE ) -OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE ) -OCV_OPTION(BUILD_OPENEXR "Build openexr from source" 
WIN32 OR ANDROID OR APPLE ) +OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE OR CARMA ) +OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE OR CARMA ) # OpenCV installation options diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index c1cd83866b..d6d5f3a98a 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -3,17 +3,17 @@ if(${CMAKE_VERSION} VERSION_LESS "2.8.3") return() endif() -if (WIN32 AND NOT MSVC) +if(WIN32 AND NOT MSVC) message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler suppoted on your platform).") return() endif() -if (CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") +if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") message(STATUS "CUDA compilation is disabled (due to Clang unsuppoted on your platform).") return() endif() -find_package(CUDA 4.1) +find_package(CUDA 4.2) if(CUDA_FOUND) set(HAVE_CUDA 1) @@ -26,15 +26,20 @@ if(CUDA_FOUND) set(HAVE_CUBLAS 1) endif() - message(STATUS "CUDA detected: " ${CUDA_VERSION}) - - if(${CUDA_VERSION_STRING} VERSION_GREATER "4.1") - set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - else() - set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + if(WITH_NVCUVID) + find_cuda_helper_libs(nvcuvid) + set(HAVE_NVCUVID 1) endif() - set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + message(STATUS "CUDA detected: " ${CUDA_VERSION}) + + if (CARMA) + set(CUDA_ARCH_BIN "3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + else() + set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + endif() string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}") string(REGEX REPLACE "\\." 
"" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}") @@ -72,11 +77,20 @@ if(CUDA_FOUND) # Tell NVCC to add PTX intermediate code for the specified architectures string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}") - foreach(ARCH IN LISTS ARCH_LIST) - set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH}) - set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}") - set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}") - endforeach() + foreach(ARCH IN LISTS ARCH_LIST) + set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH}) + set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}") + set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}") + endforeach() + + if(CARMA) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM" ) + + if (CMAKE_VERSION VERSION_LESS 2.8.10) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -ccbin=${CMAKE_CXX_COMPILER}" ) + endif() + + endif() # These vars will be processed in other scripts set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA}) @@ -84,7 +98,7 @@ if(CUDA_FOUND) message(STATUS "CUDA NVCC target flags: ${CUDA_NVCC_FLAGS}") - OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF) + OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF) if(CUDA_FAST_MATH) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math) @@ -92,7 +106,6 @@ if(CUDA_FOUND) mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR) - unset(CUDA_npp_LIBRARY CACHE) find_cuda_helper_libs(npp) macro(ocv_cuda_compile VAR) @@ -106,15 +119,15 @@ if(CUDA_FOUND) string(REPLACE "-ggdb3" "" ${var} "${${var}}") endforeach() - if (BUILD_SHARED_LIBS) + if(BUILD_SHARED_LIBS) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -DCVAPI_EXPORTS) endif() if(UNIX OR APPLE) - set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC) endif() if(APPLE) - set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only) endif() # disabled because of multiple warnings during building nvcc auto generated files diff --git a/cmake/templates/cvconfig.h.cmake b/cmake/templates/cvconfig.h.cmake index 37e092b66e..6d4d2184f9 100644 --- a/cmake/templates/cvconfig.h.cmake +++ b/cmake/templates/cvconfig.h.cmake @@ -172,21 +172,15 @@ /* NVidia Cuda Runtime API*/ #cmakedefine HAVE_CUDA -/* OpenCL Support */ -#cmakedefine HAVE_OPENCL - -/* AMD's OpenCL Fast Fourier Transform Library*/ -#cmakedefine HAVE_CLAMDFFT - -/* AMD's Basic Linear Algebra Subprograms Library*/ -#cmakedefine HAVE_CLAMDBLAS - /* NVidia Cuda Fast Fourier Transform (FFT) API*/ #cmakedefine HAVE_CUFFT /* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/ #cmakedefine HAVE_CUBLAS +/* NVidia Video Decoding API*/ +#cmakedefine HAVE_NVCUVID + /* Compile for 'real' NVIDIA GPU architectures */ #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}" @@ -199,6 +193,15 @@ /* Create PTX or BIN for 1.0 compute capability */ #cmakedefine CUDA_ARCH_BIN_OR_PTX_10 +/* OpenCL Support */ +#cmakedefine HAVE_OPENCL + +/* AMD's OpenCL Fast Fourier Transform Library*/ +#cmakedefine HAVE_CLAMDFFT + +/* AMD's Basic Linear Algebra Subprograms Library*/ +#cmakedefine HAVE_CLAMDBLAS + /* VideoInput library */ #cmakedefine HAVE_VIDEOINPUT diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index cfa14cdcdb..4c5112e3f9 100644 --- 
a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -10,7 +10,6 @@ if(HAVE_CUDA) file(GLOB lib_cuda "src/cuda/*.cu") ocv_cuda_compile(cuda_objs ${lib_cuda}) - set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) else() set(lib_cuda "") diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp index 60e2096b29..5b8ee63790 100644 --- a/modules/core/include/opencv2/core/core.hpp +++ b/modules/core/include/opencv2/core/core.hpp @@ -91,7 +91,7 @@ class SparseMat; typedef Mat MatND; class GlBuffer; -class GlTexture; +class GlTexture2D; class GlArrays; class GlCamera; @@ -1306,7 +1306,7 @@ public: STD_VECTOR_MAT = 5 << KIND_SHIFT, EXPR = 6 << KIND_SHIFT, OPENGL_BUFFER = 7 << KIND_SHIFT, - OPENGL_TEXTURE = 8 << KIND_SHIFT, + OPENGL_TEXTURE2D = 8 << KIND_SHIFT, GPU_MAT = 9 << KIND_SHIFT }; _InputArray(); @@ -1323,13 +1323,13 @@ public: _InputArray(const Scalar& s); _InputArray(const double& val); _InputArray(const GlBuffer& buf); - _InputArray(const GlTexture& tex); + _InputArray(const GlTexture2D& tex); _InputArray(const gpu::GpuMat& d_mat); virtual Mat getMat(int i=-1) const; virtual void getMatVector(vector& mv) const; virtual GlBuffer getGlBuffer() const; - virtual GlTexture getGlTexture() const; + virtual GlTexture2D getGlTexture2D() const; virtual gpu::GpuMat getGpuMat() const; virtual int kind() const; @@ -1380,6 +1380,8 @@ public: template _OutputArray(Matx<_Tp, m, n>& matx); template _OutputArray(_Tp* vec, int n); _OutputArray(gpu::GpuMat& d_mat); + _OutputArray(GlBuffer& buf); + _OutputArray(GlTexture2D& tex); _OutputArray(const Mat& m); template _OutputArray(const vector<_Tp>& vec); @@ -1390,12 +1392,16 @@ public: template _OutputArray(const Matx<_Tp, m, n>& matx); template _OutputArray(const _Tp* vec, int n); _OutputArray(const gpu::GpuMat& d_mat); + _OutputArray(const GlBuffer& buf); + _OutputArray(const GlTexture2D& tex); virtual bool fixedSize() const; virtual bool fixedType() const; virtual bool needed() const; virtual Mat& getMatRef(int i=-1) const; virtual gpu::GpuMat& getGpuMatRef() const; + virtual GlBuffer& getGlBufferRef() const; + virtual GlTexture2D& getGlTexture2DRef() const; virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; diff --git a/modules/core/include/opencv2/core/cuda_devptrs.hpp b/modules/core/include/opencv2/core/cuda_devptrs.hpp index 6363e0dc45..26fc2403f9 100644 --- a/modules/core/include/opencv2/core/cuda_devptrs.hpp +++ b/modules/core/include/opencv2/core/cuda_devptrs.hpp @@ -152,6 +152,20 @@ namespace cv //#undef __CV_GPU_DEPR_BEFORE__ //#undef __CV_GPU_DEPR_AFTER__ + namespace device + { + using cv::gpu::PtrSz; + using cv::gpu::PtrStep; + using cv::gpu::PtrStepSz; + + using cv::gpu::PtrStepSzb; + using cv::gpu::PtrStepSzf; + using cv::gpu::PtrStepSzi; + + using cv::gpu::PtrStepb; + using cv::gpu::PtrStepf; + using cv::gpu::PtrStepi; + } } } diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index 2830a9e949..6bf4e5d21b 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -79,6 +79,8 @@ namespace cv { namespace gpu WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30 }; + CV_EXPORTS bool 
deviceSupports(FeatureSet feature_set); + // Gives information about what GPU archs this OpenCV GPU module was // compiled for class CV_EXPORTS TargetArchs @@ -545,22 +547,6 @@ namespace cv { namespace gpu ensureSizeIsEnough(size.height, size.width, type, m); } - inline void createContinuous(int rows, int cols, int type, GpuMat& m) - { - int area = rows * cols; - if (!m.isContinuous() || m.type() != type || m.size().area() != area) - ensureSizeIsEnough(1, area, type, m); - m = m.reshape(0, rows); - } - - inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) - { - if (m.type() == type && m.rows >= rows && m.cols >= cols) - m = m(Rect(0, 0, cols, rows)); - else - m.create(rows, cols, type); - } - inline GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) { if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols) diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp index 93e56c3ab3..c042ccaf1f 100644 --- a/modules/core/include/opencv2/core/internal.hpp +++ b/modules/core/include/opencv2/core/internal.hpp @@ -750,39 +750,4 @@ typedef struct CvBigFuncTable (tab).fn_2d[CV_32F] = (void*)FUNCNAME##_32f##FLAG; \ (tab).fn_2d[CV_64F] = (void*)FUNCNAME##_64f##FLAG -#ifdef __cplusplus -//! OpenGL extension table -class CV_EXPORTS CvOpenGlFuncTab -{ -public: - virtual ~CvOpenGlFuncTab(); - - virtual void genBuffers(int n, unsigned int* buffers) const = 0; - virtual void deleteBuffers(int n, const unsigned int* buffers) const = 0; - - virtual void bufferData(unsigned int target, ptrdiff_t size, const void* data, unsigned int usage) const = 0; - virtual void bufferSubData(unsigned int target, ptrdiff_t offset, ptrdiff_t size, const void* data) const = 0; - - virtual void bindBuffer(unsigned int target, unsigned int buffer) const = 0; - - virtual void* mapBuffer(unsigned int target, unsigned int access) const = 0; - virtual void unmapBuffer(unsigned int target) const = 0; - - virtual void generateBitmapFont(const std::string& family, int height, int weight, bool italic, bool underline, int start, int count, int base) const = 0; - - virtual bool isGlContextInitialized() const = 0; -}; - -CV_EXPORTS void icvSetOpenGlFuncTab(const CvOpenGlFuncTab* tab); - -CV_EXPORTS bool icvCheckGlError(const char* file, const int line, const char* func = ""); - -#if defined(__GNUC__) - #define CV_CheckGlError() CV_DbgAssert( (::icvCheckGlError(__FILE__, __LINE__, __func__)) ) -#else - #define CV_CheckGlError() CV_DbgAssert( (::icvCheckGlError(__FILE__, __LINE__)) ) -#endif - -#endif //__cplusplus - #endif // __OPENCV_CORE_INTERNAL_HPP__ diff --git a/modules/core/include/opencv2/core/opengl_interop.hpp b/modules/core/include/opencv2/core/opengl_interop.hpp index 0bd2e9fdcf..cfa84756c5 100644 --- a/modules/core/include/opencv2/core/opengl_interop.hpp +++ b/modules/core/include/opencv2/core/opengl_interop.hpp @@ -47,205 +47,212 @@ #include "opencv2/core/core.hpp" -namespace cv -{ +namespace cv { + +CV_EXPORTS bool checkGlError(const char* file, const int line, const char* func = ""); + +#if defined(__GNUC__) + #define CV_CheckGlError() CV_DbgAssert( (cv::checkGlError(__FILE__, __LINE__, __func__)) ) +#else + #define CV_CheckGlError() CV_DbgAssert( (cv::checkGlError(__FILE__, __LINE__)) ) +#endif + +/////////////////// OpenGL Objects /////////////////// + //! Smart pointer for OpenGL buffer memory with reference counting. 
 class CV_EXPORTS GlBuffer
 {
 public:
-    enum Usage
+    enum Target
     {
-        ARRAY_BUFFER = 0x8892,  // buffer will use for OpenGL arrays (vertices, colors, normals, etc)
-        TEXTURE_BUFFER = 0x88EC // buffer will ise for OpenGL textures
+        ARRAY_BUFFER = 0x8892,         //!< The buffer will be used as a source for vertex data
+        ELEMENT_ARRAY_BUFFER = 0x8893, //!< The buffer will be used for indices (in glDrawElements, for example)
+        PIXEL_PACK_BUFFER = 0x88EB,    //!< The buffer will be used for reading from OpenGL textures
+        PIXEL_UNPACK_BUFFER = 0x88EC   //!< The buffer will be used for writing to OpenGL textures
+    };
+
+    enum Access
+    {
+        READ_ONLY = 0x88B8,
+        WRITE_ONLY = 0x88B9,
+        READ_WRITE = 0x88BA
     };

     //! create empty buffer
-    explicit GlBuffer(Usage usage);
+    GlBuffer();
+
+    //! create buffer from existing buffer id
+    GlBuffer(int arows, int acols, int atype, unsigned int abufId, bool autoRelease = false);
+    GlBuffer(Size asize, int atype, unsigned int abufId, bool autoRelease = false);

     //! create buffer
-    GlBuffer(int rows, int cols, int type, Usage usage);
-    GlBuffer(Size size, int type, Usage usage);
+    GlBuffer(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+    GlBuffer(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);

     //! copy from host/device memory
-    GlBuffer(InputArray mat, Usage usage);
+    explicit GlBuffer(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);

-    void create(int rows, int cols, int type, Usage usage);
-    void create(Size size, int type, Usage usage);
-    void create(int rows, int cols, int type);
-    void create(Size size, int type);
+    //! create buffer
+    void create(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+    void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false) { create(asize.height, asize.width, atype, target, autoRelease); }

+    //! release memory and delete buffer object
     void release();

-    //! copy from host/device memory
-    void copyFrom(InputArray mat);
+    //! set auto release mode (if true, release will be called in object's destructor)
+    void setAutoRelease(bool flag);

-    void bind() const;
-    void unbind() const;
+    //! copy from host/device memory
+    void copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    //! copy to host/device memory
+    void copyTo(OutputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false) const;
+
+    //! create copy of current buffer
+    GlBuffer clone(Target target = ARRAY_BUFFER, bool autoRelease = false) const;
+
+    //! bind buffer for specified target
+    void bind(Target target) const;
+
+    //! unbind any buffers from specified target
+    static void unbind(Target target);

     //! map to host memory
-    Mat mapHost();
+    Mat mapHost(Access access);
     void unmapHost();

     //! map to device memory
     gpu::GpuMat mapDevice();
     void unmapDevice();

-    inline int rows() const { return rows_; }
-    inline int cols() const { return cols_; }
-    inline Size size() const { return Size(cols_, rows_); }
-    inline bool empty() const { return rows_ == 0 || cols_ == 0; }
+    int rows() const { return rows_; }
+    int cols() const { return cols_; }
+    Size size() const { return Size(cols_, rows_); }
+    bool empty() const { return rows_ == 0 || cols_ == 0; }

-    inline int type() const { return type_; }
-    inline int depth() const { return CV_MAT_DEPTH(type_); }
-    inline int channels() const { return CV_MAT_CN(type_); }
-    inline int elemSize() const { return CV_ELEM_SIZE(type_); }
-    inline int elemSize1() const { return CV_ELEM_SIZE1(type_); }
+    int type() const { return type_; }
+    int depth() const { return CV_MAT_DEPTH(type_); }
+    int channels() const { return CV_MAT_CN(type_); }
+    int elemSize() const { return CV_ELEM_SIZE(type_); }
+    int elemSize1() const { return CV_ELEM_SIZE1(type_); }

-    inline Usage usage() const { return usage_; }
+    unsigned int bufId() const;

     class Impl;
+
 private:
+    Ptr<Impl> impl_;
     int rows_;
     int cols_;
     int type_;
-    Usage usage_;
-
-    Ptr<Impl> impl_;
 };

 template <> CV_EXPORTS void Ptr<GlBuffer::Impl>::delete_obj();

-//! Smart pointer for OpenGL 2d texture memory with reference counting.
-class CV_EXPORTS GlTexture
+//! Smart pointer for OpenGL 2D texture memory with reference counting.
+class CV_EXPORTS GlTexture2D
 {
 public:
+    enum Format
+    {
+        NONE = 0,
+        DEPTH_COMPONENT = 0x1902, //!< Depth
+        RGB = 0x1907,             //!< Red, Green, Blue
+        RGBA = 0x1908             //!< Red, Green, Blue, Alpha
+    };
+
     //! create empty texture
-    GlTexture();
+    GlTexture2D();
+
+    //! create texture from existing texture id
+    GlTexture2D(int arows, int acols, Format aformat, unsigned int atexId, bool autoRelease = false);
+    GlTexture2D(Size asize, Format aformat, unsigned int atexId, bool autoRelease = false);

     //! create texture
-    GlTexture(int rows, int cols, int type);
-    GlTexture(Size size, int type);
+    GlTexture2D(int arows, int acols, Format aformat, bool autoRelease = false);
+    GlTexture2D(Size asize, Format aformat, bool autoRelease = false);

     //! copy from host/device memory
-    explicit GlTexture(InputArray mat, bool bgra = true);
+    explicit GlTexture2D(InputArray arr, bool autoRelease = false);

-    void create(int rows, int cols, int type);
-    void create(Size size, int type);
+    //! create texture
+    void create(int arows, int acols, Format aformat, bool autoRelease = false);
+    void create(Size asize, Format aformat, bool autoRelease = false) { create(asize.height, asize.width, aformat, autoRelease); }
+
+    //! release memory and delete texture object
     void release();

+    //! set auto release mode (if true, release will be called in object's destructor)
+    void setAutoRelease(bool flag);
+
     //! copy from host/device memory
-    void copyFrom(InputArray mat, bool bgra = true);
+    void copyFrom(InputArray arr, bool autoRelease = false);

+    //! copy to host/device memory
+    void copyTo(OutputArray arr, int ddepth = CV_32F, bool autoRelease = false) const;
+
+    //!
bind texture to current active texture unit for GL_TEXTURE_2D target void bind() const; - void unbind() const; - inline int rows() const { return rows_; } - inline int cols() const { return cols_; } - inline Size size() const { return Size(cols_, rows_); } - inline bool empty() const { return rows_ == 0 || cols_ == 0; } + int rows() const { return rows_; } + int cols() const { return cols_; } + Size size() const { return Size(cols_, rows_); } + bool empty() const { return rows_ == 0 || cols_ == 0; } - inline int type() const { return type_; } - inline int depth() const { return CV_MAT_DEPTH(type_); } - inline int channels() const { return CV_MAT_CN(type_); } - inline int elemSize() const { return CV_ELEM_SIZE(type_); } - inline int elemSize1() const { return CV_ELEM_SIZE1(type_); } + Format format() const { return format_; } + + unsigned int texId() const; class Impl; + private: + Ptr impl_; int rows_; int cols_; - int type_; - - Ptr impl_; - GlBuffer buf_; + Format format_; }; -template <> CV_EXPORTS void Ptr::delete_obj(); +template <> CV_EXPORTS void Ptr::delete_obj(); //! OpenGL Arrays class CV_EXPORTS GlArrays { public: - inline GlArrays() - : vertex_(GlBuffer::ARRAY_BUFFER), color_(GlBuffer::ARRAY_BUFFER), bgra_(true), normal_(GlBuffer::ARRAY_BUFFER), texCoord_(GlBuffer::ARRAY_BUFFER) - { - } + GlArrays(); void setVertexArray(InputArray vertex); - inline void resetVertexArray() { vertex_.release(); } + void resetVertexArray(); - void setColorArray(InputArray color, bool bgra = true); - inline void resetColorArray() { color_.release(); } + void setColorArray(InputArray color); + void resetColorArray(); void setNormalArray(InputArray normal); - inline void resetNormalArray() { normal_.release(); } + void resetNormalArray(); void setTexCoordArray(InputArray texCoord); - inline void resetTexCoordArray() { texCoord_.release(); } + void resetTexCoordArray(); + + void release(); + + void setAutoRelease(bool flag); void bind() const; - void unbind() const; - inline int rows() const { return vertex_.rows(); } - inline int cols() const { return vertex_.cols(); } - inline Size size() const { return vertex_.size(); } - inline bool empty() const { return vertex_.empty(); } + int size() const { return size_; } + bool empty() const { return size_ == 0; } private: + int size_; GlBuffer vertex_; GlBuffer color_; - bool bgra_; GlBuffer normal_; GlBuffer texCoord_; }; -//! OpenGL Font -class CV_EXPORTS GlFont -{ -public: - enum Weight - { - WEIGHT_LIGHT = 300, - WEIGHT_NORMAL = 400, - WEIGHT_SEMIBOLD = 600, - WEIGHT_BOLD = 700, - WEIGHT_BLACK = 900 - }; - - enum Style - { - STYLE_NORMAL = 0, - STYLE_ITALIC = 1, - STYLE_UNDERLINE = 2 - }; - - static Ptr get(const std::string& family, int height = 12, Weight weight = WEIGHT_NORMAL, Style style = STYLE_NORMAL); - - void draw(const char* str, size_t len) const; - - inline const std::string& family() const { return family_; } - inline int height() const { return height_; } - inline Weight weight() const { return weight_; } - inline Style style() const { return style_; } - -private: - GlFont(const std::string& family, int height, Weight weight, Style style); - - std::string family_; - int height_; - Weight weight_; - Style style_; - - unsigned int base_; - - GlFont(const GlFont&); - GlFont& operator =(const GlFont&); -}; - -//! render functions +/////////////////// Render Functions /////////////////// //! 
render texture rectangle in window -CV_EXPORTS void render(const GlTexture& tex, +CV_EXPORTS void render(const GlTexture2D& tex, Rect_ wndRect = Rect_(0.0, 0.0, 1.0, 1.0), Rect_ texRect = Rect_(0.0, 0.0, 1.0, 1.0)); @@ -267,67 +274,13 @@ namespace RenderMode { //! render OpenGL arrays CV_EXPORTS void render(const GlArrays& arr, int mode = RenderMode::POINTS, Scalar color = Scalar::all(255)); +CV_EXPORTS void render(const GlArrays& arr, InputArray indices, int mode = RenderMode::POINTS, Scalar color = Scalar::all(255)); -CV_EXPORTS void render(const std::string& str, const Ptr& font, Scalar color, Point2d pos); - -//! OpenGL camera -class CV_EXPORTS GlCamera -{ -public: - GlCamera(); - - void lookAt(Point3d eye, Point3d center, Point3d up); - void setCameraPos(Point3d pos, double yaw, double pitch, double roll); - - void setScale(Point3d scale); - - void setProjectionMatrix(const Mat& projectionMatrix, bool transpose = true); - void setPerspectiveProjection(double fov, double aspect, double zNear, double zFar); - void setOrthoProjection(double left, double right, double bottom, double top, double zNear, double zFar); - - void setupProjectionMatrix() const; - void setupModelViewMatrix() const; - -private: - Point3d eye_; - Point3d center_; - Point3d up_; - - Point3d pos_; - double yaw_; - double pitch_; - double roll_; - - bool useLookAtParams_; - - Point3d scale_; - - Mat projectionMatrix_; - - double fov_; - double aspect_; - - double left_; - double right_; - double bottom_; - double top_; - - double zNear_; - double zFar_; - - bool perspectiveProjection_; -}; - -inline void GlBuffer::create(Size _size, int _type, Usage _usage) { create(_size.height, _size.width, _type, _usage); } -inline void GlBuffer::create(int _rows, int _cols, int _type) { create(_rows, _cols, _type, usage()); } -inline void GlBuffer::create(Size _size, int _type) { create(_size.height, _size.width, _type, usage()); } -inline void GlTexture::create(Size _size, int _type) { create(_size.height, _size.width, _type); } - -namespace gpu -{ +namespace gpu { //! 
set a CUDA device to use OpenGL interoperability CV_EXPORTS void setGlDevice(int device = 0); } + } // namespace cv #endif // __cplusplus diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/core/src/cuda/matrix_operations.cu index 9e830e563b..60aa073406 100644 --- a/modules/core/src/cuda/matrix_operations.cu +++ b/modules/core/src/cuda/matrix_operations.cu @@ -44,6 +44,7 @@ #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/type_traits.hpp" namespace cv { namespace gpu { namespace device { @@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device void writeScalar(const int*); void writeScalar(const float*); void writeScalar(const double*); + void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t); }}} @@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device //////////////////////////////// ConvertTo //////////////////////////////// /////////////////////////////////////////////////////////////////////////// - template struct Convertor : unary_function + template struct Convertor : unary_function { - Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} + Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {} - __device__ __forceinline__ D operator()(const T& src) const + __device__ __forceinline__ D operator()(typename TypeTraits::ParameterType src) const { return saturate_cast(alpha * src + beta); } - double alpha, beta; + S alpha, beta; }; namespace detail @@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device }; } - template struct TransformFunctorTraits< Convertor > : detail::ConvertTraits< Convertor > + template struct TransformFunctorTraits< Convertor > : detail::ConvertTraits< Convertor > { }; - template + template void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream) { cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); cudaSafeCall( cudaSetDoubleForDevice(&beta) ); - Convertor op(alpha, beta); + Convertor op(static_cast(alpha), static_cast(beta)); cv::gpu::device::transform((PtrStepSz)src, (PtrStepSz)dst, op, WithOutMask(), stream); } @@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device { typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream); - static const caller_t tab[8][8] = + static const caller_t tab[7][7] = { - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {0,0,0,0,0,0,0,0} + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + } }; caller_t func = tab[sdepth][ddepth]; - if (!func) - cv::gpu::error("Unsupported convert operation", 
__FILE__, __LINE__, "convert_gpu"); - func(src, dst, alpha, beta, stream); } diff --git a/modules/core/src/gl_core_3_1.cpp b/modules/core/src/gl_core_3_1.cpp new file mode 100644 index 0000000000..3bc74faa19 --- /dev/null +++ b/modules/core/src/gl_core_3_1.cpp @@ -0,0 +1,2718 @@ +#include +#include +#include "cvconfig.h" +#include "opencv2/core/core.hpp" +#include "gl_core_3_1.hpp" + +#ifdef HAVE_OPENGL + #if defined(__APPLE__) + #include + + static void* AppleGLGetProcAddress (const char* name) + { + static const struct mach_header* image = 0; + if (!image) + image = NSAddImage("/System/Library/Frameworks/OpenGL.framework/Versions/Current/OpenGL", NSADDIMAGE_OPTION_RETURN_ON_ERROR); + + // prepend a '_' for the Unix C symbol mangling convention + std::string symbolName = "_"; + symbolName += std::string(name); + + NSSymbol symbol = image ? NSLookupSymbolInImage(image, &symbolName[0], NSLOOKUPSYMBOLINIMAGE_OPTION_BIND | NSLOOKUPSYMBOLINIMAGE_OPTION_RETURN_ON_ERROR) : 0; + + return symbol ? NSAddressOfSymbol(symbol) : 0; + } + #endif // __APPLE__ + + #if defined(__sgi) || defined (__sun) + #include + #include + + static void* SunGetProcAddress (const char* name) + { + typedef void* (func_t*)(const GLubyte*); + + static void* h = 0; + static func_t gpa = 0; + + if (!h) + { + h = dlopen(NULL, RTLD_LAZY | RTLD_LOCAL); + if (!h) + return 0; + gpa = (func_t) dlsym(h, "glXGetProcAddress"); + } + + return gpa ? gpa((const GLubyte*) name) : dlsym(h, name); + } + #endif // __sgi || __sun + + #if defined(_WIN32) + #ifdef _MSC_VER + #pragma warning(disable: 4055) + #pragma warning(disable: 4054) + #endif + + static int TestPointer(const PROC pTest) + { + if(!pTest) + return 0; + + ptrdiff_t iTest = (ptrdiff_t) pTest; + + if (iTest == 1 || iTest == 2 || iTest == 3 || iTest == -1) + return 0; + + return 1; + } + + static PROC WinGetProcAddress(const char* name) + { + PROC pFunc = wglGetProcAddress((LPCSTR) name); + if (TestPointer(pFunc)) + return pFunc; + + HMODULE glMod = GetModuleHandleA("OpenGL32.dll"); + return (PROC) GetProcAddress(glMod, (LPCSTR) name); + } + #endif // _WIN32 + + #if defined(_WIN32) + #define CV_GL_GET_PROC_ADDRESS(name) WinGetProcAddress(name) + #elif defined(__APPLE__) + #define CV_GL_GET_PROC_ADDRESS(name) AppleGLGetProcAddress(name) + #elif defined(__sgi) || defined(__sun) + #define CV_GL_GET_PROC_ADDRESS(name) SunGetProcAddress(name) + #else // GLX + #include + + #define CV_GL_GET_PROC_ADDRESS(name) glXGetProcAddressARB((const GLubyte*) name) + #endif + + static void* IntGetProcAddress(const char* name) + { + void* func = (void*) CV_GL_GET_PROC_ADDRESS(name); + if (!func) + { + std::ostringstream msg; + msg << "Can't load OpenGL extension [" << name << "]"; + CV_Error(CV_OpenGlApiCallError, msg.str()); + } + return func; + } +#else + static void* IntGetProcAddress(const char*) + { + CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support"); + return 0; + } +#endif + +namespace gl +{ + ////////////////////////////////////////////// + // Function pointer types + + // Extension: 1.1 + typedef void (CODEGEN_FUNCPTR *PFNCULLFACEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNFRONTFACEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNHINTPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNLINEWIDTHPROC)(GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNPOINTSIZEPROC)(GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNPOLYGONMODEPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNSCISSORPROC)(GLint , GLint , GLsizei , GLsizei ); + typedef 
void (CODEGEN_FUNCPTR *PFNTEXPARAMETERFPROC)(GLenum , GLenum , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERFVPROC)(GLenum , GLenum , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERIPROC)(GLenum , GLenum , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERIVPROC)(GLenum , GLenum , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNTEXIMAGE1DPROC)(GLenum , GLint , GLint , GLsizei , GLint , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXIMAGE2DPROC)(GLenum , GLint , GLint , GLsizei , GLsizei , GLint , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNDRAWBUFFERPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNCLEARPROC)(GLbitfield ); + typedef void (CODEGEN_FUNCPTR *PFNCLEARCOLORPROC)(GLfloat , GLfloat , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNCLEARSTENCILPROC)(GLint ); + typedef void (CODEGEN_FUNCPTR *PFNCLEARDEPTHPROC)(GLdouble ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILMASKPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNCOLORMASKPROC)(GLboolean , GLboolean , GLboolean , GLboolean ); + typedef void (CODEGEN_FUNCPTR *PFNDEPTHMASKPROC)(GLboolean ); + typedef void (CODEGEN_FUNCPTR *PFNDISABLEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNENABLEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNFINISHPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNFLUSHPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNBLENDFUNCPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNLOGICOPPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILFUNCPROC)(GLenum , GLint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILOPPROC)(GLenum , GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDEPTHFUNCPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNPIXELSTOREFPROC)(GLenum , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNPIXELSTOREIPROC)(GLenum , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNREADBUFFERPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNREADPIXELSPROC)(GLint , GLint , GLsizei , GLsizei , GLenum , GLenum , GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETBOOLEANVPROC)(GLenum , GLboolean *); + typedef void (CODEGEN_FUNCPTR *PFNGETDOUBLEVPROC)(GLenum , GLdouble *); + typedef GLenum (CODEGEN_FUNCPTR *PFNGETERRORPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNGETFLOATVPROC)(GLenum , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETINTEGERVPROC)(GLenum , GLint *); + typedef const GLubyte * (CODEGEN_FUNCPTR *PFNGETSTRINGPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXIMAGEPROC)(GLenum , GLint , GLenum , GLenum , GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXPARAMETERFVPROC)(GLenum , GLenum , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXPARAMETERIVPROC)(GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXLEVELPARAMETERFVPROC)(GLenum , GLint , GLenum , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXLEVELPARAMETERIVPROC)(GLenum , GLint , GLenum , GLint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISENABLEDPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDEPTHRANGEPROC)(GLdouble , GLdouble ); + typedef void (CODEGEN_FUNCPTR *PFNVIEWPORTPROC)(GLint , GLint , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWARRAYSPROC)(GLenum , GLint , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWELEMENTSPROC)(GLenum , GLsizei , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETPOINTERVPROC)(GLenum , GLvoid* *); + typedef void (CODEGEN_FUNCPTR *PFNPOLYGONOFFSETPROC)(GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR 
*PFNCOPYTEXIMAGE1DPROC)(GLenum , GLint , GLenum , GLint , GLint , GLsizei , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNCOPYTEXIMAGE2DPROC)(GLenum , GLint , GLenum , GLint , GLint , GLsizei , GLsizei , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNCOPYTEXSUBIMAGE1DPROC)(GLenum , GLint , GLint , GLint , GLint , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNCOPYTEXSUBIMAGE2DPROC)(GLenum , GLint , GLint , GLint , GLint , GLint , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNTEXSUBIMAGE1DPROC)(GLenum , GLint , GLint , GLsizei , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXSUBIMAGE2DPROC)(GLenum , GLint , GLint , GLint , GLsizei , GLsizei , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNBINDTEXTUREPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETETEXTURESPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENTEXTURESPROC)(GLsizei , GLuint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISTEXTUREPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNINDEXUBPROC)(GLubyte ); + typedef void (CODEGEN_FUNCPTR *PFNINDEXUBVPROC)(const GLubyte *); + + // Extension: 1.2 + typedef void (CODEGEN_FUNCPTR *PFNBLENDCOLORPROC)(GLfloat , GLfloat , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNBLENDEQUATIONPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWRANGEELEMENTSPROC)(GLenum , GLuint , GLuint , GLsizei , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXSUBIMAGE3DPROC)(GLenum , GLint , GLint , GLint , GLint , GLsizei , GLsizei , GLsizei , GLenum , GLenum , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOPYTEXSUBIMAGE3DPROC)(GLenum , GLint , GLint , GLint , GLint , GLint , GLint , GLsizei , GLsizei ); + + // Extension: 1.3 + typedef void (CODEGEN_FUNCPTR *PFNACTIVETEXTUREPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNSAMPLECOVERAGEPROC)(GLfloat , GLboolean ); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXIMAGE3DPROC)(GLenum , GLint , GLenum , GLsizei , GLsizei , GLsizei , GLint , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXIMAGE2DPROC)(GLenum , GLint , GLenum , GLsizei , GLsizei , GLint , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXIMAGE1DPROC)(GLenum , GLint , GLenum , GLsizei , GLint , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXSUBIMAGE3DPROC)(GLenum , GLint , GLint , GLint , GLint , GLsizei , GLsizei , GLsizei , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXSUBIMAGE2DPROC)(GLenum , GLint , GLint , GLint , GLsizei , GLsizei , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPRESSEDTEXSUBIMAGE1DPROC)(GLenum , GLint , GLint , GLsizei , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETCOMPRESSEDTEXIMAGEPROC)(GLenum , GLint , GLvoid *); + + // Extension: 1.4 + typedef void (CODEGEN_FUNCPTR *PFNBLENDFUNCSEPARATEPROC)(GLenum , GLenum , GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNMULTIDRAWARRAYSPROC)(GLenum , const GLint *, const GLsizei *, GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNMULTIDRAWELEMENTSPROC)(GLenum , const GLsizei *, GLenum , const GLvoid* const *, GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNPOINTPARAMETERFPROC)(GLenum , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNPOINTPARAMETERFVPROC)(GLenum , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNPOINTPARAMETERIPROC)(GLenum , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNPOINTPARAMETERIVPROC)(GLenum , 
const GLint *); + + // Extension: 1.5 + typedef void (CODEGEN_FUNCPTR *PFNGENQUERIESPROC)(GLsizei , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNDELETEQUERIESPROC)(GLsizei , const GLuint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISQUERYPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBEGINQUERYPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNENDQUERYPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNGETQUERYIVPROC)(GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETQUERYOBJECTIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETQUERYOBJECTUIVPROC)(GLuint , GLenum , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNBINDBUFFERPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETEBUFFERSPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENBUFFERSPROC)(GLsizei , GLuint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISBUFFERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBUFFERDATAPROC)(GLenum , GLsizeiptr , const GLvoid *, GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNBUFFERSUBDATAPROC)(GLenum , GLintptr , GLsizeiptr , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETBUFFERSUBDATAPROC)(GLenum , GLintptr , GLsizeiptr , GLvoid *); + typedef GLvoid* (CODEGEN_FUNCPTR *PFNMAPBUFFERPROC)(GLenum , GLenum ); + typedef GLboolean (CODEGEN_FUNCPTR *PFNUNMAPBUFFERPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNGETBUFFERPARAMETERIVPROC)(GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETBUFFERPOINTERVPROC)(GLenum , GLenum , GLvoid* *); + + // Extension: 2.0 + typedef void (CODEGEN_FUNCPTR *PFNBLENDEQUATIONSEPARATEPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWBUFFERSPROC)(GLsizei , const GLenum *); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILOPSEPARATEPROC)(GLenum , GLenum , GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILFUNCSEPARATEPROC)(GLenum , GLenum , GLint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNSTENCILMASKSEPARATEPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNATTACHSHADERPROC)(GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBINDATTRIBLOCATIONPROC)(GLuint , GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNCOMPILESHADERPROC)(GLuint ); + typedef GLuint (CODEGEN_FUNCPTR *PFNCREATEPROGRAMPROC)(); + typedef GLuint (CODEGEN_FUNCPTR *PFNCREATESHADERPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDELETEPROGRAMPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETESHADERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDETACHSHADERPROC)(GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDISABLEVERTEXATTRIBARRAYPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNENABLEVERTEXATTRIBARRAYPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEATTRIBPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLint *, GLenum *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLint *, GLenum *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETATTACHEDSHADERSPROC)(GLuint , GLsizei , GLsizei *, GLuint *); + typedef GLint (CODEGEN_FUNCPTR *PFNGETATTRIBLOCATIONPROC)(GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETPROGRAMIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETPROGRAMINFOLOGPROC)(GLuint , GLsizei , GLsizei *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETSHADERIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETSHADERINFOLOGPROC)(GLuint , GLsizei , GLsizei *, GLchar *); + typedef void 
(CODEGEN_FUNCPTR *PFNGETSHADERSOURCEPROC)(GLuint , GLsizei , GLsizei *, GLchar *); + typedef GLint (CODEGEN_FUNCPTR *PFNGETUNIFORMLOCATIONPROC)(GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETUNIFORMFVPROC)(GLuint , GLint , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETUNIFORMIVPROC)(GLuint , GLint , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBDVPROC)(GLuint , GLenum , GLdouble *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBFVPROC)(GLuint , GLenum , GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBPOINTERVPROC)(GLuint , GLenum , GLvoid* *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISPROGRAMPROC)(GLuint ); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISSHADERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNLINKPROGRAMPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNSHADERSOURCEPROC)(GLuint , GLsizei , const GLchar* const *, const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUSEPROGRAMPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1FPROC)(GLint , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2FPROC)(GLint , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3FPROC)(GLint , GLfloat , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4FPROC)(GLint , GLfloat , GLfloat , GLfloat , GLfloat ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1IPROC)(GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2IPROC)(GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3IPROC)(GLint , GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4IPROC)(GLint , GLint , GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1FVPROC)(GLint , GLsizei , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2FVPROC)(GLint , GLsizei , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3FVPROC)(GLint , GLsizei , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4FVPROC)(GLint , GLsizei , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1IVPROC)(GLint , GLsizei , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2IVPROC)(GLint , GLsizei , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3IVPROC)(GLint , GLsizei , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4IVPROC)(GLint , GLsizei , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX2FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX3FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX4FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNVALIDATEPROGRAMPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBPOINTERPROC)(GLuint , GLint , GLenum , GLboolean , GLsizei , const GLvoid *); + + // Extension: 2.1 + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX2X3FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX3X2FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX2X4FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX4X2FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMMATRIX3X4FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + typedef void (CODEGEN_FUNCPTR 
*PFNUNIFORMMATRIX4X3FVPROC)(GLint , GLsizei , GLboolean , const GLfloat *); + + // Extension: ARB_vertex_array_object + typedef void (CODEGEN_FUNCPTR *PFNBINDVERTEXARRAYPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETEVERTEXARRAYSPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENVERTEXARRAYSPROC)(GLsizei , GLuint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISVERTEXARRAYPROC)(GLuint ); + + // Extension: ARB_map_buffer_range + typedef GLvoid* (CODEGEN_FUNCPTR *PFNMAPBUFFERRANGEPROC)(GLenum , GLintptr , GLsizeiptr , GLbitfield ); + typedef void (CODEGEN_FUNCPTR *PFNFLUSHMAPPEDBUFFERRANGEPROC)(GLenum , GLintptr , GLsizeiptr ); + + // Extension: ARB_framebuffer_object + typedef GLboolean (CODEGEN_FUNCPTR *PFNISRENDERBUFFERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBINDRENDERBUFFERPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETERENDERBUFFERSPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENRENDERBUFFERSPROC)(GLsizei , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNRENDERBUFFERSTORAGEPROC)(GLenum , GLenum , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNGETRENDERBUFFERPARAMETERIVPROC)(GLenum , GLenum , GLint *); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISFRAMEBUFFERPROC)(GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBINDFRAMEBUFFERPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDELETEFRAMEBUFFERSPROC)(GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGENFRAMEBUFFERSPROC)(GLsizei , GLuint *); + typedef GLenum (CODEGEN_FUNCPTR *PFNCHECKFRAMEBUFFERSTATUSPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERTEXTURE1DPROC)(GLenum , GLenum , GLenum , GLuint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERTEXTURE2DPROC)(GLenum , GLenum , GLenum , GLuint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERTEXTURE3DPROC)(GLenum , GLenum , GLenum , GLuint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERRENDERBUFFERPROC)(GLenum , GLenum , GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)(GLenum , GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGENERATEMIPMAPPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNBLITFRAMEBUFFERPROC)(GLint , GLint , GLint , GLint , GLint , GLint , GLint , GLint , GLbitfield , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNRENDERBUFFERSTORAGEMULTISAMPLEPROC)(GLenum , GLsizei , GLenum , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNFRAMEBUFFERTEXTURELAYERPROC)(GLenum , GLenum , GLuint , GLint , GLint ); + + // Extension: 3.0 + typedef void (CODEGEN_FUNCPTR *PFNCOLORMASKIPROC)(GLuint , GLboolean , GLboolean , GLboolean , GLboolean ); + typedef void (CODEGEN_FUNCPTR *PFNGETBOOLEANI_VPROC)(GLenum , GLuint , GLboolean *); + typedef void (CODEGEN_FUNCPTR *PFNGETINTEGERI_VPROC)(GLenum , GLuint , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNENABLEIPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNDISABLEIPROC)(GLenum , GLuint ); + typedef GLboolean (CODEGEN_FUNCPTR *PFNISENABLEDIPROC)(GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNBEGINTRANSFORMFEEDBACKPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNENDTRANSFORMFEEDBACKPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNBINDBUFFERRANGEPROC)(GLenum , GLuint , GLuint , GLintptr , GLsizeiptr ); + typedef void (CODEGEN_FUNCPTR *PFNBINDBUFFERBASEPROC)(GLenum , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNTRANSFORMFEEDBACKVARYINGSPROC)(GLuint , GLsizei , const GLchar* const *, GLenum ); + 
typedef void (CODEGEN_FUNCPTR *PFNGETTRANSFORMFEEDBACKVARYINGPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLsizei *, GLenum *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNCLAMPCOLORPROC)(GLenum , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNBEGINCONDITIONALRENDERPROC)(GLuint , GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNENDCONDITIONALRENDERPROC)(); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBIPOINTERPROC)(GLuint , GLint , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBIIVPROC)(GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETVERTEXATTRIBIUIVPROC)(GLuint , GLenum , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI1IPROC)(GLuint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI2IPROC)(GLuint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI3IPROC)(GLuint , GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4IPROC)(GLuint , GLint , GLint , GLint , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI1UIPROC)(GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI2UIPROC)(GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI3UIPROC)(GLuint , GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4UIPROC)(GLuint , GLuint , GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI1IVPROC)(GLuint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI2IVPROC)(GLuint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI3IVPROC)(GLuint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4IVPROC)(GLuint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI1UIVPROC)(GLuint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI2UIVPROC)(GLuint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI3UIVPROC)(GLuint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4UIVPROC)(GLuint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4BVPROC)(GLuint , const GLbyte *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4SVPROC)(GLuint , const GLshort *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4UBVPROC)(GLuint , const GLubyte *); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXATTRIBI4USVPROC)(GLuint , const GLushort *); + typedef void (CODEGEN_FUNCPTR *PFNGETUNIFORMUIVPROC)(GLuint , GLint , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNBINDFRAGDATALOCATIONPROC)(GLuint , GLuint , const GLchar *); + typedef GLint (CODEGEN_FUNCPTR *PFNGETFRAGDATALOCATIONPROC)(GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1UIPROC)(GLint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2UIPROC)(GLint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3UIPROC)(GLint , GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4UIPROC)(GLint , GLuint , GLuint , GLuint , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM1UIVPROC)(GLint , GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM2UIVPROC)(GLint , GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM3UIVPROC)(GLint , GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORM4UIVPROC)(GLint , GLsizei , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERIIVPROC)(GLenum , GLenum , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNTEXPARAMETERIUIVPROC)(GLenum , GLenum , const GLuint *); + typedef void (CODEGEN_FUNCPTR 
*PFNGETTEXPARAMETERIIVPROC)(GLenum , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETTEXPARAMETERIUIVPROC)(GLenum , GLenum , GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNCLEARBUFFERIVPROC)(GLenum , GLint , const GLint *); + typedef void (CODEGEN_FUNCPTR *PFNCLEARBUFFERUIVPROC)(GLenum , GLint , const GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNCLEARBUFFERFVPROC)(GLenum , GLint , const GLfloat *); + typedef void (CODEGEN_FUNCPTR *PFNCLEARBUFFERFIPROC)(GLenum , GLint , GLfloat , GLint ); + typedef const GLubyte * (CODEGEN_FUNCPTR *PFNGETSTRINGIPROC)(GLenum , GLuint ); + + // Extension: ARB_uniform_buffer_object + typedef void (CODEGEN_FUNCPTR *PFNGETUNIFORMINDICESPROC)(GLuint , GLsizei , const GLchar* const *, GLuint *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMSIVPROC)(GLuint , GLsizei , const GLuint *, GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMNAMEPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLchar *); + typedef GLuint (CODEGEN_FUNCPTR *PFNGETUNIFORMBLOCKINDEXPROC)(GLuint , const GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMBLOCKIVPROC)(GLuint , GLuint , GLenum , GLint *); + typedef void (CODEGEN_FUNCPTR *PFNGETACTIVEUNIFORMBLOCKNAMEPROC)(GLuint , GLuint , GLsizei , GLsizei *, GLchar *); + typedef void (CODEGEN_FUNCPTR *PFNUNIFORMBLOCKBINDINGPROC)(GLuint , GLuint , GLuint ); + + // Extension: ARB_copy_buffer + typedef void (CODEGEN_FUNCPTR *PFNCOPYBUFFERSUBDATAPROC)(GLenum , GLenum , GLintptr , GLintptr , GLsizeiptr ); + + // Extension: 3.1 + typedef void (CODEGEN_FUNCPTR *PFNDRAWARRAYSINSTANCEDPROC)(GLenum , GLint , GLsizei , GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNDRAWELEMENTSINSTANCEDPROC)(GLenum , GLsizei , GLenum , const GLvoid *, GLsizei ); + typedef void (CODEGEN_FUNCPTR *PFNTEXBUFFERPROC)(GLenum , GLenum , GLuint ); + typedef void (CODEGEN_FUNCPTR *PFNPRIMITIVERESTARTINDEXPROC)(GLuint ); + + // Legacy + typedef void (CODEGEN_FUNCPTR *PFNENABLECLIENTSTATEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNDISABLECLIENTSTATEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNVERTEXPOINTERPROC)(GLint , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNNORMALPOINTERPROC)(GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNCOLORPOINTERPROC)(GLint , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXCOORDPOINTERPROC)(GLint , GLenum , GLsizei , const GLvoid *); + typedef void (CODEGEN_FUNCPTR *PFNTEXENVIPROC)(GLenum , GLenum , GLint ); + typedef void (CODEGEN_FUNCPTR *PFNMATRIXMODEPROC)(GLenum ); + typedef void (CODEGEN_FUNCPTR *PFNLOADIDENTITYPROC)(void); + typedef void (CODEGEN_FUNCPTR *PFNORTHOPROC)(GLdouble , GLdouble , GLdouble , GLdouble , GLdouble , GLdouble ); + typedef void (CODEGEN_FUNCPTR *PFNCOLOR3DPROC)(GLdouble , GLdouble , GLdouble ); + + ////////////////////////////////////////////// + // Function pointers + + // Extension: 1.1 + PFNCULLFACEPROC CullFace; + PFNFRONTFACEPROC FrontFace; + PFNHINTPROC Hint; + PFNLINEWIDTHPROC LineWidth; + PFNPOINTSIZEPROC PointSize; + PFNPOLYGONMODEPROC PolygonMode; + PFNSCISSORPROC Scissor; + PFNTEXPARAMETERFPROC TexParameterf; + PFNTEXPARAMETERFVPROC TexParameterfv; + PFNTEXPARAMETERIPROC TexParameteri; + PFNTEXPARAMETERIVPROC TexParameteriv; + PFNTEXIMAGE1DPROC TexImage1D; + PFNTEXIMAGE2DPROC TexImage2D; + PFNDRAWBUFFERPROC DrawBuffer; + PFNCLEARPROC Clear; + PFNCLEARCOLORPROC ClearColor; + PFNCLEARSTENCILPROC ClearStencil; + PFNCLEARDEPTHPROC ClearDepth; + PFNSTENCILMASKPROC StencilMask; + 
PFNCOLORMASKPROC ColorMask; + PFNDEPTHMASKPROC DepthMask; + PFNDISABLEPROC Disable; + PFNENABLEPROC Enable; + PFNFINISHPROC Finish; + PFNFLUSHPROC Flush; + PFNBLENDFUNCPROC BlendFunc; + PFNLOGICOPPROC LogicOp; + PFNSTENCILFUNCPROC StencilFunc; + PFNSTENCILOPPROC StencilOp; + PFNDEPTHFUNCPROC DepthFunc; + PFNPIXELSTOREFPROC PixelStoref; + PFNPIXELSTOREIPROC PixelStorei; + PFNREADBUFFERPROC ReadBuffer; + PFNREADPIXELSPROC ReadPixels; + PFNGETBOOLEANVPROC GetBooleanv; + PFNGETDOUBLEVPROC GetDoublev; + PFNGETERRORPROC GetError; + PFNGETFLOATVPROC GetFloatv; + PFNGETINTEGERVPROC GetIntegerv; + PFNGETSTRINGPROC GetString; + PFNGETTEXIMAGEPROC GetTexImage; + PFNGETTEXPARAMETERFVPROC GetTexParameterfv; + PFNGETTEXPARAMETERIVPROC GetTexParameteriv; + PFNGETTEXLEVELPARAMETERFVPROC GetTexLevelParameterfv; + PFNGETTEXLEVELPARAMETERIVPROC GetTexLevelParameteriv; + PFNISENABLEDPROC IsEnabled; + PFNDEPTHRANGEPROC DepthRange; + PFNVIEWPORTPROC Viewport; + PFNDRAWARRAYSPROC DrawArrays; + PFNDRAWELEMENTSPROC DrawElements; + PFNGETPOINTERVPROC GetPointerv; + PFNPOLYGONOFFSETPROC PolygonOffset; + PFNCOPYTEXIMAGE1DPROC CopyTexImage1D; + PFNCOPYTEXIMAGE2DPROC CopyTexImage2D; + PFNCOPYTEXSUBIMAGE1DPROC CopyTexSubImage1D; + PFNCOPYTEXSUBIMAGE2DPROC CopyTexSubImage2D; + PFNTEXSUBIMAGE1DPROC TexSubImage1D; + PFNTEXSUBIMAGE2DPROC TexSubImage2D; + PFNBINDTEXTUREPROC BindTexture; + PFNDELETETEXTURESPROC DeleteTextures; + PFNGENTEXTURESPROC GenTextures; + PFNISTEXTUREPROC IsTexture; + PFNINDEXUBPROC Indexub; + PFNINDEXUBVPROC Indexubv; + + // Extension: 1.2 + PFNBLENDCOLORPROC BlendColor; + PFNBLENDEQUATIONPROC BlendEquation; + PFNDRAWRANGEELEMENTSPROC DrawRangeElements; + PFNTEXSUBIMAGE3DPROC TexSubImage3D; + PFNCOPYTEXSUBIMAGE3DPROC CopyTexSubImage3D; + + // Extension: 1.3 + PFNACTIVETEXTUREPROC ActiveTexture; + PFNSAMPLECOVERAGEPROC SampleCoverage; + PFNCOMPRESSEDTEXIMAGE3DPROC CompressedTexImage3D; + PFNCOMPRESSEDTEXIMAGE2DPROC CompressedTexImage2D; + PFNCOMPRESSEDTEXIMAGE1DPROC CompressedTexImage1D; + PFNCOMPRESSEDTEXSUBIMAGE3DPROC CompressedTexSubImage3D; + PFNCOMPRESSEDTEXSUBIMAGE2DPROC CompressedTexSubImage2D; + PFNCOMPRESSEDTEXSUBIMAGE1DPROC CompressedTexSubImage1D; + PFNGETCOMPRESSEDTEXIMAGEPROC GetCompressedTexImage; + + // Extension: 1.4 + PFNBLENDFUNCSEPARATEPROC BlendFuncSeparate; + PFNMULTIDRAWARRAYSPROC MultiDrawArrays; + PFNMULTIDRAWELEMENTSPROC MultiDrawElements; + PFNPOINTPARAMETERFPROC PointParameterf; + PFNPOINTPARAMETERFVPROC PointParameterfv; + PFNPOINTPARAMETERIPROC PointParameteri; + PFNPOINTPARAMETERIVPROC PointParameteriv; + + // Extension: 1.5 + PFNGENQUERIESPROC GenQueries; + PFNDELETEQUERIESPROC DeleteQueries; + PFNISQUERYPROC IsQuery; + PFNBEGINQUERYPROC BeginQuery; + PFNENDQUERYPROC EndQuery; + PFNGETQUERYIVPROC GetQueryiv; + PFNGETQUERYOBJECTIVPROC GetQueryObjectiv; + PFNGETQUERYOBJECTUIVPROC GetQueryObjectuiv; + PFNBINDBUFFERPROC BindBuffer; + PFNDELETEBUFFERSPROC DeleteBuffers; + PFNGENBUFFERSPROC GenBuffers; + PFNISBUFFERPROC IsBuffer; + PFNBUFFERDATAPROC BufferData; + PFNBUFFERSUBDATAPROC BufferSubData; + PFNGETBUFFERSUBDATAPROC GetBufferSubData; + PFNMAPBUFFERPROC MapBuffer; + PFNUNMAPBUFFERPROC UnmapBuffer; + PFNGETBUFFERPARAMETERIVPROC GetBufferParameteriv; + PFNGETBUFFERPOINTERVPROC GetBufferPointerv; + + // Extension: 2.0 + PFNBLENDEQUATIONSEPARATEPROC BlendEquationSeparate; + PFNDRAWBUFFERSPROC DrawBuffers; + PFNSTENCILOPSEPARATEPROC StencilOpSeparate; + PFNSTENCILFUNCSEPARATEPROC StencilFuncSeparate; + PFNSTENCILMASKSEPARATEPROC StencilMaskSeparate; + PFNATTACHSHADERPROC 
AttachShader; + PFNBINDATTRIBLOCATIONPROC BindAttribLocation; + PFNCOMPILESHADERPROC CompileShader; + PFNCREATEPROGRAMPROC CreateProgram; + PFNCREATESHADERPROC CreateShader; + PFNDELETEPROGRAMPROC DeleteProgram; + PFNDELETESHADERPROC DeleteShader; + PFNDETACHSHADERPROC DetachShader; + PFNDISABLEVERTEXATTRIBARRAYPROC DisableVertexAttribArray; + PFNENABLEVERTEXATTRIBARRAYPROC EnableVertexAttribArray; + PFNGETACTIVEATTRIBPROC GetActiveAttrib; + PFNGETACTIVEUNIFORMPROC GetActiveUniform; + PFNGETATTACHEDSHADERSPROC GetAttachedShaders; + PFNGETATTRIBLOCATIONPROC GetAttribLocation; + PFNGETPROGRAMIVPROC GetProgramiv; + PFNGETPROGRAMINFOLOGPROC GetProgramInfoLog; + PFNGETSHADERIVPROC GetShaderiv; + PFNGETSHADERINFOLOGPROC GetShaderInfoLog; + PFNGETSHADERSOURCEPROC GetShaderSource; + PFNGETUNIFORMLOCATIONPROC GetUniformLocation; + PFNGETUNIFORMFVPROC GetUniformfv; + PFNGETUNIFORMIVPROC GetUniformiv; + PFNGETVERTEXATTRIBDVPROC GetVertexAttribdv; + PFNGETVERTEXATTRIBFVPROC GetVertexAttribfv; + PFNGETVERTEXATTRIBIVPROC GetVertexAttribiv; + PFNGETVERTEXATTRIBPOINTERVPROC GetVertexAttribPointerv; + PFNISPROGRAMPROC IsProgram; + PFNISSHADERPROC IsShader; + PFNLINKPROGRAMPROC LinkProgram; + PFNSHADERSOURCEPROC ShaderSource; + PFNUSEPROGRAMPROC UseProgram; + PFNUNIFORM1FPROC Uniform1f; + PFNUNIFORM2FPROC Uniform2f; + PFNUNIFORM3FPROC Uniform3f; + PFNUNIFORM4FPROC Uniform4f; + PFNUNIFORM1IPROC Uniform1i; + PFNUNIFORM2IPROC Uniform2i; + PFNUNIFORM3IPROC Uniform3i; + PFNUNIFORM4IPROC Uniform4i; + PFNUNIFORM1FVPROC Uniform1fv; + PFNUNIFORM2FVPROC Uniform2fv; + PFNUNIFORM3FVPROC Uniform3fv; + PFNUNIFORM4FVPROC Uniform4fv; + PFNUNIFORM1IVPROC Uniform1iv; + PFNUNIFORM2IVPROC Uniform2iv; + PFNUNIFORM3IVPROC Uniform3iv; + PFNUNIFORM4IVPROC Uniform4iv; + PFNUNIFORMMATRIX2FVPROC UniformMatrix2fv; + PFNUNIFORMMATRIX3FVPROC UniformMatrix3fv; + PFNUNIFORMMATRIX4FVPROC UniformMatrix4fv; + PFNVALIDATEPROGRAMPROC ValidateProgram; + PFNVERTEXATTRIBPOINTERPROC VertexAttribPointer; + + // Extension: 2.1 + PFNUNIFORMMATRIX2X3FVPROC UniformMatrix2x3fv; + PFNUNIFORMMATRIX3X2FVPROC UniformMatrix3x2fv; + PFNUNIFORMMATRIX2X4FVPROC UniformMatrix2x4fv; + PFNUNIFORMMATRIX4X2FVPROC UniformMatrix4x2fv; + PFNUNIFORMMATRIX3X4FVPROC UniformMatrix3x4fv; + PFNUNIFORMMATRIX4X3FVPROC UniformMatrix4x3fv; + + // Extension: ARB_vertex_array_object + PFNBINDVERTEXARRAYPROC BindVertexArray; + PFNDELETEVERTEXARRAYSPROC DeleteVertexArrays; + PFNGENVERTEXARRAYSPROC GenVertexArrays; + PFNISVERTEXARRAYPROC IsVertexArray; + + // Extension: ARB_map_buffer_range + PFNMAPBUFFERRANGEPROC MapBufferRange; + PFNFLUSHMAPPEDBUFFERRANGEPROC FlushMappedBufferRange; + + // Extension: ARB_framebuffer_object + PFNISRENDERBUFFERPROC IsRenderbuffer; + PFNBINDRENDERBUFFERPROC BindRenderbuffer; + PFNDELETERENDERBUFFERSPROC DeleteRenderbuffers; + PFNGENRENDERBUFFERSPROC GenRenderbuffers; + PFNRENDERBUFFERSTORAGEPROC RenderbufferStorage; + PFNGETRENDERBUFFERPARAMETERIVPROC GetRenderbufferParameteriv; + PFNISFRAMEBUFFERPROC IsFramebuffer; + PFNBINDFRAMEBUFFERPROC BindFramebuffer; + PFNDELETEFRAMEBUFFERSPROC DeleteFramebuffers; + PFNGENFRAMEBUFFERSPROC GenFramebuffers; + PFNCHECKFRAMEBUFFERSTATUSPROC CheckFramebufferStatus; + PFNFRAMEBUFFERTEXTURE1DPROC FramebufferTexture1D; + PFNFRAMEBUFFERTEXTURE2DPROC FramebufferTexture2D; + PFNFRAMEBUFFERTEXTURE3DPROC FramebufferTexture3D; + PFNFRAMEBUFFERRENDERBUFFERPROC FramebufferRenderbuffer; + PFNGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC GetFramebufferAttachmentParameteriv; + PFNGENERATEMIPMAPPROC GenerateMipmap; + 
PFNBLITFRAMEBUFFERPROC BlitFramebuffer; + PFNRENDERBUFFERSTORAGEMULTISAMPLEPROC RenderbufferStorageMultisample; + PFNFRAMEBUFFERTEXTURELAYERPROC FramebufferTextureLayer; + + // Extension: 3.0 + PFNCOLORMASKIPROC ColorMaski; + PFNGETBOOLEANI_VPROC GetBooleani_v; + PFNGETINTEGERI_VPROC GetIntegeri_v; + PFNENABLEIPROC Enablei; + PFNDISABLEIPROC Disablei; + PFNISENABLEDIPROC IsEnabledi; + PFNBEGINTRANSFORMFEEDBACKPROC BeginTransformFeedback; + PFNENDTRANSFORMFEEDBACKPROC EndTransformFeedback; + PFNBINDBUFFERRANGEPROC BindBufferRange; + PFNBINDBUFFERBASEPROC BindBufferBase; + PFNTRANSFORMFEEDBACKVARYINGSPROC TransformFeedbackVaryings; + PFNGETTRANSFORMFEEDBACKVARYINGPROC GetTransformFeedbackVarying; + PFNCLAMPCOLORPROC ClampColor; + PFNBEGINCONDITIONALRENDERPROC BeginConditionalRender; + PFNENDCONDITIONALRENDERPROC EndConditionalRender; + PFNVERTEXATTRIBIPOINTERPROC VertexAttribIPointer; + PFNGETVERTEXATTRIBIIVPROC GetVertexAttribIiv; + PFNGETVERTEXATTRIBIUIVPROC GetVertexAttribIuiv; + PFNVERTEXATTRIBI1IPROC VertexAttribI1i; + PFNVERTEXATTRIBI2IPROC VertexAttribI2i; + PFNVERTEXATTRIBI3IPROC VertexAttribI3i; + PFNVERTEXATTRIBI4IPROC VertexAttribI4i; + PFNVERTEXATTRIBI1UIPROC VertexAttribI1ui; + PFNVERTEXATTRIBI2UIPROC VertexAttribI2ui; + PFNVERTEXATTRIBI3UIPROC VertexAttribI3ui; + PFNVERTEXATTRIBI4UIPROC VertexAttribI4ui; + PFNVERTEXATTRIBI1IVPROC VertexAttribI1iv; + PFNVERTEXATTRIBI2IVPROC VertexAttribI2iv; + PFNVERTEXATTRIBI3IVPROC VertexAttribI3iv; + PFNVERTEXATTRIBI4IVPROC VertexAttribI4iv; + PFNVERTEXATTRIBI1UIVPROC VertexAttribI1uiv; + PFNVERTEXATTRIBI2UIVPROC VertexAttribI2uiv; + PFNVERTEXATTRIBI3UIVPROC VertexAttribI3uiv; + PFNVERTEXATTRIBI4UIVPROC VertexAttribI4uiv; + PFNVERTEXATTRIBI4BVPROC VertexAttribI4bv; + PFNVERTEXATTRIBI4SVPROC VertexAttribI4sv; + PFNVERTEXATTRIBI4UBVPROC VertexAttribI4ubv; + PFNVERTEXATTRIBI4USVPROC VertexAttribI4usv; + PFNGETUNIFORMUIVPROC GetUniformuiv; + PFNBINDFRAGDATALOCATIONPROC BindFragDataLocation; + PFNGETFRAGDATALOCATIONPROC GetFragDataLocation; + PFNUNIFORM1UIPROC Uniform1ui; + PFNUNIFORM2UIPROC Uniform2ui; + PFNUNIFORM3UIPROC Uniform3ui; + PFNUNIFORM4UIPROC Uniform4ui; + PFNUNIFORM1UIVPROC Uniform1uiv; + PFNUNIFORM2UIVPROC Uniform2uiv; + PFNUNIFORM3UIVPROC Uniform3uiv; + PFNUNIFORM4UIVPROC Uniform4uiv; + PFNTEXPARAMETERIIVPROC TexParameterIiv; + PFNTEXPARAMETERIUIVPROC TexParameterIuiv; + PFNGETTEXPARAMETERIIVPROC GetTexParameterIiv; + PFNGETTEXPARAMETERIUIVPROC GetTexParameterIuiv; + PFNCLEARBUFFERIVPROC ClearBufferiv; + PFNCLEARBUFFERUIVPROC ClearBufferuiv; + PFNCLEARBUFFERFVPROC ClearBufferfv; + PFNCLEARBUFFERFIPROC ClearBufferfi; + PFNGETSTRINGIPROC GetStringi; + + // Extension: ARB_uniform_buffer_object + PFNGETUNIFORMINDICESPROC GetUniformIndices; + PFNGETACTIVEUNIFORMSIVPROC GetActiveUniformsiv; + PFNGETACTIVEUNIFORMNAMEPROC GetActiveUniformName; + PFNGETUNIFORMBLOCKINDEXPROC GetUniformBlockIndex; + PFNGETACTIVEUNIFORMBLOCKIVPROC GetActiveUniformBlockiv; + PFNGETACTIVEUNIFORMBLOCKNAMEPROC GetActiveUniformBlockName; + PFNUNIFORMBLOCKBINDINGPROC UniformBlockBinding; + + // Extension: ARB_copy_buffer + PFNCOPYBUFFERSUBDATAPROC CopyBufferSubData; + + // Extension: 3.1 + PFNDRAWARRAYSINSTANCEDPROC DrawArraysInstanced; + PFNDRAWELEMENTSINSTANCEDPROC DrawElementsInstanced; + PFNTEXBUFFERPROC TexBuffer; + PFNPRIMITIVERESTARTINDEXPROC PrimitiveRestartIndex; + + // Legacy + PFNENABLECLIENTSTATEPROC EnableClientState; + PFNDISABLECLIENTSTATEPROC DisableClientState; + PFNVERTEXPOINTERPROC VertexPointer; + PFNNORMALPOINTERPROC NormalPointer; + 
PFNCOLORPOINTERPROC ColorPointer; + PFNTEXCOORDPOINTERPROC TexCoordPointer; + + PFNTEXENVIPROC TexEnvi; + + PFNMATRIXMODEPROC MatrixMode; + PFNLOADIDENTITYPROC LoadIdentity; + PFNORTHOPROC Ortho; + + PFNCOLOR3DPROC Color3d; + + ////////////////////////////////////////////// + // Switch functions + + // Extension: 1.1 + + static void CODEGEN_FUNCPTR Switch_CullFace(GLenum mode) + { + CullFace = (PFNCULLFACEPROC)IntGetProcAddress("glCullFace"); + CullFace(mode); + } + + static void CODEGEN_FUNCPTR Switch_FrontFace(GLenum mode) + { + FrontFace = (PFNFRONTFACEPROC)IntGetProcAddress("glFrontFace"); + FrontFace(mode); + } + + static void CODEGEN_FUNCPTR Switch_Hint(GLenum target, GLenum mode) + { + Hint = (PFNHINTPROC)IntGetProcAddress("glHint"); + Hint(target, mode); + } + + static void CODEGEN_FUNCPTR Switch_LineWidth(GLfloat width) + { + LineWidth = (PFNLINEWIDTHPROC)IntGetProcAddress("glLineWidth"); + LineWidth(width); + } + + static void CODEGEN_FUNCPTR Switch_PointSize(GLfloat size) + { + PointSize = (PFNPOINTSIZEPROC)IntGetProcAddress("glPointSize"); + PointSize(size); + } + + static void CODEGEN_FUNCPTR Switch_PolygonMode(GLenum face, GLenum mode) + { + PolygonMode = (PFNPOLYGONMODEPROC)IntGetProcAddress("glPolygonMode"); + PolygonMode(face, mode); + } + + static void CODEGEN_FUNCPTR Switch_Scissor(GLint x, GLint y, GLsizei width, GLsizei height) + { + Scissor = (PFNSCISSORPROC)IntGetProcAddress("glScissor"); + Scissor(x, y, width, height); + } + + static void CODEGEN_FUNCPTR Switch_TexParameterf(GLenum target, GLenum pname, GLfloat param) + { + TexParameterf = (PFNTEXPARAMETERFPROC)IntGetProcAddress("glTexParameterf"); + TexParameterf(target, pname, param); + } + + static void CODEGEN_FUNCPTR Switch_TexParameterfv(GLenum target, GLenum pname, const GLfloat *params) + { + TexParameterfv = (PFNTEXPARAMETERFVPROC)IntGetProcAddress("glTexParameterfv"); + TexParameterfv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_TexParameteri(GLenum target, GLenum pname, GLint param) + { + TexParameteri = (PFNTEXPARAMETERIPROC)IntGetProcAddress("glTexParameteri"); + TexParameteri(target, pname, param); + } + + static void CODEGEN_FUNCPTR Switch_TexParameteriv(GLenum target, GLenum pname, const GLint *params) + { + TexParameteriv = (PFNTEXPARAMETERIVPROC)IntGetProcAddress("glTexParameteriv"); + TexParameteriv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_TexImage1D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const GLvoid *pixels) + { + TexImage1D = (PFNTEXIMAGE1DPROC)IntGetProcAddress("glTexImage1D"); + TexImage1D(target, level, internalformat, width, border, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_TexImage2D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels) + { + TexImage2D = (PFNTEXIMAGE2DPROC)IntGetProcAddress("glTexImage2D"); + TexImage2D(target, level, internalformat, width, height, border, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_DrawBuffer(GLenum mode) + { + DrawBuffer = (PFNDRAWBUFFERPROC)IntGetProcAddress("glDrawBuffer"); + DrawBuffer(mode); + } + + static void CODEGEN_FUNCPTR Switch_Clear(GLbitfield mask) + { + Clear = (PFNCLEARPROC)IntGetProcAddress("glClear"); + Clear(mask); + } + + static void CODEGEN_FUNCPTR Switch_ClearColor(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha) + { + ClearColor = 
(PFNCLEARCOLORPROC)IntGetProcAddress("glClearColor"); + ClearColor(red, green, blue, alpha); + } + + static void CODEGEN_FUNCPTR Switch_ClearStencil(GLint s) + { + ClearStencil = (PFNCLEARSTENCILPROC)IntGetProcAddress("glClearStencil"); + ClearStencil(s); + } + + static void CODEGEN_FUNCPTR Switch_ClearDepth(GLdouble depth) + { + ClearDepth = (PFNCLEARDEPTHPROC)IntGetProcAddress("glClearDepth"); + ClearDepth(depth); + } + + static void CODEGEN_FUNCPTR Switch_StencilMask(GLuint mask) + { + StencilMask = (PFNSTENCILMASKPROC)IntGetProcAddress("glStencilMask"); + StencilMask(mask); + } + + static void CODEGEN_FUNCPTR Switch_ColorMask(GLboolean red, GLboolean green, GLboolean blue, GLboolean alpha) + { + ColorMask = (PFNCOLORMASKPROC)IntGetProcAddress("glColorMask"); + ColorMask(red, green, blue, alpha); + } + + static void CODEGEN_FUNCPTR Switch_DepthMask(GLboolean flag) + { + DepthMask = (PFNDEPTHMASKPROC)IntGetProcAddress("glDepthMask"); + DepthMask(flag); + } + + static void CODEGEN_FUNCPTR Switch_Disable(GLenum cap) + { + Disable = (PFNDISABLEPROC)IntGetProcAddress("glDisable"); + Disable(cap); + } + + static void CODEGEN_FUNCPTR Switch_Enable(GLenum cap) + { + Enable = (PFNENABLEPROC)IntGetProcAddress("glEnable"); + Enable(cap); + } + + static void CODEGEN_FUNCPTR Switch_Finish() + { + Finish = (PFNFINISHPROC)IntGetProcAddress("glFinish"); + Finish(); + } + + static void CODEGEN_FUNCPTR Switch_Flush() + { + Flush = (PFNFLUSHPROC)IntGetProcAddress("glFlush"); + Flush(); + } + + static void CODEGEN_FUNCPTR Switch_BlendFunc(GLenum sfactor, GLenum dfactor) + { + BlendFunc = (PFNBLENDFUNCPROC)IntGetProcAddress("glBlendFunc"); + BlendFunc(sfactor, dfactor); + } + + static void CODEGEN_FUNCPTR Switch_LogicOp(GLenum opcode) + { + LogicOp = (PFNLOGICOPPROC)IntGetProcAddress("glLogicOp"); + LogicOp(opcode); + } + + static void CODEGEN_FUNCPTR Switch_StencilFunc(GLenum func, GLint ref, GLuint mask) + { + StencilFunc = (PFNSTENCILFUNCPROC)IntGetProcAddress("glStencilFunc"); + StencilFunc(func, ref, mask); + } + + static void CODEGEN_FUNCPTR Switch_StencilOp(GLenum fail, GLenum zfail, GLenum zpass) + { + StencilOp = (PFNSTENCILOPPROC)IntGetProcAddress("glStencilOp"); + StencilOp(fail, zfail, zpass); + } + + static void CODEGEN_FUNCPTR Switch_DepthFunc(GLenum func) + { + DepthFunc = (PFNDEPTHFUNCPROC)IntGetProcAddress("glDepthFunc"); + DepthFunc(func); + } + + static void CODEGEN_FUNCPTR Switch_PixelStoref(GLenum pname, GLfloat param) + { + PixelStoref = (PFNPIXELSTOREFPROC)IntGetProcAddress("glPixelStoref"); + PixelStoref(pname, param); + } + + static void CODEGEN_FUNCPTR Switch_PixelStorei(GLenum pname, GLint param) + { + PixelStorei = (PFNPIXELSTOREIPROC)IntGetProcAddress("glPixelStorei"); + PixelStorei(pname, param); + } + + static void CODEGEN_FUNCPTR Switch_ReadBuffer(GLenum mode) + { + ReadBuffer = (PFNREADBUFFERPROC)IntGetProcAddress("glReadBuffer"); + ReadBuffer(mode); + } + + static void CODEGEN_FUNCPTR Switch_ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLvoid *pixels) + { + ReadPixels = (PFNREADPIXELSPROC)IntGetProcAddress("glReadPixels"); + ReadPixels(x, y, width, height, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_GetBooleanv(GLenum pname, GLboolean *params) + { + GetBooleanv = (PFNGETBOOLEANVPROC)IntGetProcAddress("glGetBooleanv"); + GetBooleanv(pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetDoublev(GLenum pname, GLdouble *params) + { + GetDoublev = (PFNGETDOUBLEVPROC)IntGetProcAddress("glGetDoublev"); + 
GetDoublev(pname, params); + } + + static GLenum CODEGEN_FUNCPTR Switch_GetError() + { + GetError = (PFNGETERRORPROC)IntGetProcAddress("glGetError"); + return GetError(); + } + + static void CODEGEN_FUNCPTR Switch_GetFloatv(GLenum pname, GLfloat *params) + { + GetFloatv = (PFNGETFLOATVPROC)IntGetProcAddress("glGetFloatv"); + GetFloatv(pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetIntegerv(GLenum pname, GLint *params) + { + GetIntegerv = (PFNGETINTEGERVPROC)IntGetProcAddress("glGetIntegerv"); + GetIntegerv(pname, params); + } + + static const GLubyte * CODEGEN_FUNCPTR Switch_GetString(GLenum name) + { + GetString = (PFNGETSTRINGPROC)IntGetProcAddress("glGetString"); + return GetString(name); + } + + static void CODEGEN_FUNCPTR Switch_GetTexImage(GLenum target, GLint level, GLenum format, GLenum type, GLvoid *pixels) + { + GetTexImage = (PFNGETTEXIMAGEPROC)IntGetProcAddress("glGetTexImage"); + GetTexImage(target, level, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_GetTexParameterfv(GLenum target, GLenum pname, GLfloat *params) + { + GetTexParameterfv = (PFNGETTEXPARAMETERFVPROC)IntGetProcAddress("glGetTexParameterfv"); + GetTexParameterfv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexParameteriv(GLenum target, GLenum pname, GLint *params) + { + GetTexParameteriv = (PFNGETTEXPARAMETERIVPROC)IntGetProcAddress("glGetTexParameteriv"); + GetTexParameteriv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexLevelParameterfv(GLenum target, GLint level, GLenum pname, GLfloat *params) + { + GetTexLevelParameterfv = (PFNGETTEXLEVELPARAMETERFVPROC)IntGetProcAddress("glGetTexLevelParameterfv"); + GetTexLevelParameterfv(target, level, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexLevelParameteriv(GLenum target, GLint level, GLenum pname, GLint *params) + { + GetTexLevelParameteriv = (PFNGETTEXLEVELPARAMETERIVPROC)IntGetProcAddress("glGetTexLevelParameteriv"); + GetTexLevelParameteriv(target, level, pname, params); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsEnabled(GLenum cap) + { + IsEnabled = (PFNISENABLEDPROC)IntGetProcAddress("glIsEnabled"); + return IsEnabled(cap); + } + + static void CODEGEN_FUNCPTR Switch_DepthRange(GLdouble ren_near, GLdouble ren_far) + { + DepthRange = (PFNDEPTHRANGEPROC)IntGetProcAddress("glDepthRange"); + DepthRange(ren_near, ren_far); + } + + static void CODEGEN_FUNCPTR Switch_Viewport(GLint x, GLint y, GLsizei width, GLsizei height) + { + Viewport = (PFNVIEWPORTPROC)IntGetProcAddress("glViewport"); + Viewport(x, y, width, height); + } + + static void CODEGEN_FUNCPTR Switch_DrawArrays(GLenum mode, GLint first, GLsizei count) + { + DrawArrays = (PFNDRAWARRAYSPROC)IntGetProcAddress("glDrawArrays"); + DrawArrays(mode, first, count); + } + + static void CODEGEN_FUNCPTR Switch_DrawElements(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices) + { + DrawElements = (PFNDRAWELEMENTSPROC)IntGetProcAddress("glDrawElements"); + DrawElements(mode, count, type, indices); + } + + static void CODEGEN_FUNCPTR Switch_GetPointerv(GLenum pname, GLvoid* *params) + { + GetPointerv = (PFNGETPOINTERVPROC)IntGetProcAddress("glGetPointerv"); + GetPointerv(pname, params); + } + + static void CODEGEN_FUNCPTR Switch_PolygonOffset(GLfloat factor, GLfloat units) + { + PolygonOffset = (PFNPOLYGONOFFSETPROC)IntGetProcAddress("glPolygonOffset"); + PolygonOffset(factor, units); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexImage1D(GLenum target, GLint level, GLenum internalformat, GLint 
x, GLint y, GLsizei width, GLint border) + { + CopyTexImage1D = (PFNCOPYTEXIMAGE1DPROC)IntGetProcAddress("glCopyTexImage1D"); + CopyTexImage1D(target, level, internalformat, x, y, width, border); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexImage2D(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border) + { + CopyTexImage2D = (PFNCOPYTEXIMAGE2DPROC)IntGetProcAddress("glCopyTexImage2D"); + CopyTexImage2D(target, level, internalformat, x, y, width, height, border); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexSubImage1D(GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width) + { + CopyTexSubImage1D = (PFNCOPYTEXSUBIMAGE1DPROC)IntGetProcAddress("glCopyTexSubImage1D"); + CopyTexSubImage1D(target, level, xoffset, x, y, width); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height) + { + CopyTexSubImage2D = (PFNCOPYTEXSUBIMAGE2DPROC)IntGetProcAddress("glCopyTexSubImage2D"); + CopyTexSubImage2D(target, level, xoffset, yoffset, x, y, width, height); + } + + static void CODEGEN_FUNCPTR Switch_TexSubImage1D(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const GLvoid *pixels) + { + TexSubImage1D = (PFNTEXSUBIMAGE1DPROC)IntGetProcAddress("glTexSubImage1D"); + TexSubImage1D(target, level, xoffset, width, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels) + { + TexSubImage2D = (PFNTEXSUBIMAGE2DPROC)IntGetProcAddress("glTexSubImage2D"); + TexSubImage2D(target, level, xoffset, yoffset, width, height, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_BindTexture(GLenum target, GLuint texture) + { + BindTexture = (PFNBINDTEXTUREPROC)IntGetProcAddress("glBindTexture"); + BindTexture(target, texture); + } + + static void CODEGEN_FUNCPTR Switch_DeleteTextures(GLsizei n, const GLuint *textures) + { + DeleteTextures = (PFNDELETETEXTURESPROC)IntGetProcAddress("glDeleteTextures"); + DeleteTextures(n, textures); + } + + static void CODEGEN_FUNCPTR Switch_GenTextures(GLsizei n, GLuint *textures) + { + GenTextures = (PFNGENTEXTURESPROC)IntGetProcAddress("glGenTextures"); + GenTextures(n, textures); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsTexture(GLuint texture) + { + IsTexture = (PFNISTEXTUREPROC)IntGetProcAddress("glIsTexture"); + return IsTexture(texture); + } + + static void CODEGEN_FUNCPTR Switch_Indexub(GLubyte c) + { + Indexub = (PFNINDEXUBPROC)IntGetProcAddress("glIndexub"); + Indexub(c); + } + + static void CODEGEN_FUNCPTR Switch_Indexubv(const GLubyte *c) + { + Indexubv = (PFNINDEXUBVPROC)IntGetProcAddress("glIndexubv"); + Indexubv(c); + } + + // Extension: 1.2 + + static void CODEGEN_FUNCPTR Switch_BlendColor(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha) + { + BlendColor = (PFNBLENDCOLORPROC)IntGetProcAddress("glBlendColor"); + BlendColor(red, green, blue, alpha); + } + + static void CODEGEN_FUNCPTR Switch_BlendEquation(GLenum mode) + { + BlendEquation = (PFNBLENDEQUATIONPROC)IntGetProcAddress("glBlendEquation"); + BlendEquation(mode); + } + + static void CODEGEN_FUNCPTR Switch_DrawRangeElements(GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid *indices) + { + DrawRangeElements = 
(PFNDRAWRANGEELEMENTSPROC)IntGetProcAddress("glDrawRangeElements"); + DrawRangeElements(mode, start, end, count, type, indices); + } + + static void CODEGEN_FUNCPTR Switch_TexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid *pixels) + { + TexSubImage3D = (PFNTEXSUBIMAGE3DPROC)IntGetProcAddress("glTexSubImage3D"); + TexSubImage3D(target, level, xoffset, yoffset, zoffset, width, height, depth, format, type, pixels); + } + + static void CODEGEN_FUNCPTR Switch_CopyTexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height) + { + CopyTexSubImage3D = (PFNCOPYTEXSUBIMAGE3DPROC)IntGetProcAddress("glCopyTexSubImage3D"); + CopyTexSubImage3D(target, level, xoffset, yoffset, zoffset, x, y, width, height); + } + + // Extension: 1.3 + + static void CODEGEN_FUNCPTR Switch_ActiveTexture(GLenum texture) + { + ActiveTexture = (PFNACTIVETEXTUREPROC)IntGetProcAddress("glActiveTexture"); + ActiveTexture(texture); + } + + static void CODEGEN_FUNCPTR Switch_SampleCoverage(GLfloat value, GLboolean invert) + { + SampleCoverage = (PFNSAMPLECOVERAGEPROC)IntGetProcAddress("glSampleCoverage"); + SampleCoverage(value, invert); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexImage3D(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid *data) + { + CompressedTexImage3D = (PFNCOMPRESSEDTEXIMAGE3DPROC)IntGetProcAddress("glCompressedTexImage3D"); + CompressedTexImage3D(target, level, internalformat, width, height, depth, border, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexImage2D(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const GLvoid *data) + { + CompressedTexImage2D = (PFNCOMPRESSEDTEXIMAGE2DPROC)IntGetProcAddress("glCompressedTexImage2D"); + CompressedTexImage2D(target, level, internalformat, width, height, border, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexImage1D(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const GLvoid *data) + { + CompressedTexImage1D = (PFNCOMPRESSEDTEXIMAGE1DPROC)IntGetProcAddress("glCompressedTexImage1D"); + CompressedTexImage1D(target, level, internalformat, width, border, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid *data) + { + CompressedTexSubImage3D = (PFNCOMPRESSEDTEXSUBIMAGE3DPROC)IntGetProcAddress("glCompressedTexSubImage3D"); + CompressedTexSubImage3D(target, level, xoffset, yoffset, zoffset, width, height, depth, format, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const GLvoid *data) + { + CompressedTexSubImage2D = (PFNCOMPRESSEDTEXSUBIMAGE2DPROC)IntGetProcAddress("glCompressedTexSubImage2D"); + CompressedTexSubImage2D(target, level, xoffset, yoffset, width, height, format, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_CompressedTexSubImage1D(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, 
const GLvoid *data) + { + CompressedTexSubImage1D = (PFNCOMPRESSEDTEXSUBIMAGE1DPROC)IntGetProcAddress("glCompressedTexSubImage1D"); + CompressedTexSubImage1D(target, level, xoffset, width, format, imageSize, data); + } + + static void CODEGEN_FUNCPTR Switch_GetCompressedTexImage(GLenum target, GLint level, GLvoid *img) + { + GetCompressedTexImage = (PFNGETCOMPRESSEDTEXIMAGEPROC)IntGetProcAddress("glGetCompressedTexImage"); + GetCompressedTexImage(target, level, img); + } + + // Extension: 1.4 + + static void CODEGEN_FUNCPTR Switch_BlendFuncSeparate(GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha) + { + BlendFuncSeparate = (PFNBLENDFUNCSEPARATEPROC)IntGetProcAddress("glBlendFuncSeparate"); + BlendFuncSeparate(sfactorRGB, dfactorRGB, sfactorAlpha, dfactorAlpha); + } + + static void CODEGEN_FUNCPTR Switch_MultiDrawArrays(GLenum mode, const GLint *first, const GLsizei *count, GLsizei drawcount) + { + MultiDrawArrays = (PFNMULTIDRAWARRAYSPROC)IntGetProcAddress("glMultiDrawArrays"); + MultiDrawArrays(mode, first, count, drawcount); + } + + static void CODEGEN_FUNCPTR Switch_MultiDrawElements(GLenum mode, const GLsizei *count, GLenum type, const GLvoid* const *indices, GLsizei drawcount) + { + MultiDrawElements = (PFNMULTIDRAWELEMENTSPROC)IntGetProcAddress("glMultiDrawElements"); + MultiDrawElements(mode, count, type, indices, drawcount); + } + + static void CODEGEN_FUNCPTR Switch_PointParameterf(GLenum pname, GLfloat param) + { + PointParameterf = (PFNPOINTPARAMETERFPROC)IntGetProcAddress("glPointParameterf"); + PointParameterf(pname, param); + } + + static void CODEGEN_FUNCPTR Switch_PointParameterfv(GLenum pname, const GLfloat *params) + { + PointParameterfv = (PFNPOINTPARAMETERFVPROC)IntGetProcAddress("glPointParameterfv"); + PointParameterfv(pname, params); + } + + static void CODEGEN_FUNCPTR Switch_PointParameteri(GLenum pname, GLint param) + { + PointParameteri = (PFNPOINTPARAMETERIPROC)IntGetProcAddress("glPointParameteri"); + PointParameteri(pname, param); + } + + static void CODEGEN_FUNCPTR Switch_PointParameteriv(GLenum pname, const GLint *params) + { + PointParameteriv = (PFNPOINTPARAMETERIVPROC)IntGetProcAddress("glPointParameteriv"); + PointParameteriv(pname, params); + } + + // Extension: 1.5 + + static void CODEGEN_FUNCPTR Switch_GenQueries(GLsizei n, GLuint *ids) + { + GenQueries = (PFNGENQUERIESPROC)IntGetProcAddress("glGenQueries"); + GenQueries(n, ids); + } + + static void CODEGEN_FUNCPTR Switch_DeleteQueries(GLsizei n, const GLuint *ids) + { + DeleteQueries = (PFNDELETEQUERIESPROC)IntGetProcAddress("glDeleteQueries"); + DeleteQueries(n, ids); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsQuery(GLuint id) + { + IsQuery = (PFNISQUERYPROC)IntGetProcAddress("glIsQuery"); + return IsQuery(id); + } + + static void CODEGEN_FUNCPTR Switch_BeginQuery(GLenum target, GLuint id) + { + BeginQuery = (PFNBEGINQUERYPROC)IntGetProcAddress("glBeginQuery"); + BeginQuery(target, id); + } + + static void CODEGEN_FUNCPTR Switch_EndQuery(GLenum target) + { + EndQuery = (PFNENDQUERYPROC)IntGetProcAddress("glEndQuery"); + EndQuery(target); + } + + static void CODEGEN_FUNCPTR Switch_GetQueryiv(GLenum target, GLenum pname, GLint *params) + { + GetQueryiv = (PFNGETQUERYIVPROC)IntGetProcAddress("glGetQueryiv"); + GetQueryiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetQueryObjectiv(GLuint id, GLenum pname, GLint *params) + { + GetQueryObjectiv = (PFNGETQUERYOBJECTIVPROC)IntGetProcAddress("glGetQueryObjectiv"); + GetQueryObjectiv(id, pname, 
params); + } + + static void CODEGEN_FUNCPTR Switch_GetQueryObjectuiv(GLuint id, GLenum pname, GLuint *params) + { + GetQueryObjectuiv = (PFNGETQUERYOBJECTUIVPROC)IntGetProcAddress("glGetQueryObjectuiv"); + GetQueryObjectuiv(id, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_BindBuffer(GLenum target, GLuint buffer) + { + BindBuffer = (PFNBINDBUFFERPROC)IntGetProcAddress("glBindBuffer"); + BindBuffer(target, buffer); + } + + static void CODEGEN_FUNCPTR Switch_DeleteBuffers(GLsizei n, const GLuint *buffers) + { + DeleteBuffers = (PFNDELETEBUFFERSPROC)IntGetProcAddress("glDeleteBuffers"); + DeleteBuffers(n, buffers); + } + + static void CODEGEN_FUNCPTR Switch_GenBuffers(GLsizei n, GLuint *buffers) + { + GenBuffers = (PFNGENBUFFERSPROC)IntGetProcAddress("glGenBuffers"); + GenBuffers(n, buffers); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsBuffer(GLuint buffer) + { + IsBuffer = (PFNISBUFFERPROC)IntGetProcAddress("glIsBuffer"); + return IsBuffer(buffer); + } + + static void CODEGEN_FUNCPTR Switch_BufferData(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage) + { + BufferData = (PFNBUFFERDATAPROC)IntGetProcAddress("glBufferData"); + BufferData(target, size, data, usage); + } + + static void CODEGEN_FUNCPTR Switch_BufferSubData(GLenum target, GLintptr offset, GLsizeiptr size, const GLvoid *data) + { + BufferSubData = (PFNBUFFERSUBDATAPROC)IntGetProcAddress("glBufferSubData"); + BufferSubData(target, offset, size, data); + } + + static void CODEGEN_FUNCPTR Switch_GetBufferSubData(GLenum target, GLintptr offset, GLsizeiptr size, GLvoid *data) + { + GetBufferSubData = (PFNGETBUFFERSUBDATAPROC)IntGetProcAddress("glGetBufferSubData"); + GetBufferSubData(target, offset, size, data); + } + + static GLvoid* CODEGEN_FUNCPTR Switch_MapBuffer(GLenum target, GLenum access) + { + MapBuffer = (PFNMAPBUFFERPROC)IntGetProcAddress("glMapBuffer"); + return MapBuffer(target, access); + } + + static GLboolean CODEGEN_FUNCPTR Switch_UnmapBuffer(GLenum target) + { + UnmapBuffer = (PFNUNMAPBUFFERPROC)IntGetProcAddress("glUnmapBuffer"); + return UnmapBuffer(target); + } + + static void CODEGEN_FUNCPTR Switch_GetBufferParameteriv(GLenum target, GLenum pname, GLint *params) + { + GetBufferParameteriv = (PFNGETBUFFERPARAMETERIVPROC)IntGetProcAddress("glGetBufferParameteriv"); + GetBufferParameteriv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetBufferPointerv(GLenum target, GLenum pname, GLvoid* *params) + { + GetBufferPointerv = (PFNGETBUFFERPOINTERVPROC)IntGetProcAddress("glGetBufferPointerv"); + GetBufferPointerv(target, pname, params); + } + + // Extension: 2.0 + + static void CODEGEN_FUNCPTR Switch_BlendEquationSeparate(GLenum modeRGB, GLenum modeAlpha) + { + BlendEquationSeparate = (PFNBLENDEQUATIONSEPARATEPROC)IntGetProcAddress("glBlendEquationSeparate"); + BlendEquationSeparate(modeRGB, modeAlpha); + } + + static void CODEGEN_FUNCPTR Switch_DrawBuffers(GLsizei n, const GLenum *bufs) + { + DrawBuffers = (PFNDRAWBUFFERSPROC)IntGetProcAddress("glDrawBuffers"); + DrawBuffers(n, bufs); + } + + static void CODEGEN_FUNCPTR Switch_StencilOpSeparate(GLenum face, GLenum sfail, GLenum dpfail, GLenum dppass) + { + StencilOpSeparate = (PFNSTENCILOPSEPARATEPROC)IntGetProcAddress("glStencilOpSeparate"); + StencilOpSeparate(face, sfail, dpfail, dppass); + } + + static void CODEGEN_FUNCPTR Switch_StencilFuncSeparate(GLenum face, GLenum func, GLint ref, GLuint mask) + { + StencilFuncSeparate = (PFNSTENCILFUNCSEPARATEPROC)IntGetProcAddress("glStencilFuncSeparate"); + 
StencilFuncSeparate(face, func, ref, mask); + } + + static void CODEGEN_FUNCPTR Switch_StencilMaskSeparate(GLenum face, GLuint mask) + { + StencilMaskSeparate = (PFNSTENCILMASKSEPARATEPROC)IntGetProcAddress("glStencilMaskSeparate"); + StencilMaskSeparate(face, mask); + } + + static void CODEGEN_FUNCPTR Switch_AttachShader(GLuint program, GLuint shader) + { + AttachShader = (PFNATTACHSHADERPROC)IntGetProcAddress("glAttachShader"); + AttachShader(program, shader); + } + + static void CODEGEN_FUNCPTR Switch_BindAttribLocation(GLuint program, GLuint index, const GLchar *name) + { + BindAttribLocation = (PFNBINDATTRIBLOCATIONPROC)IntGetProcAddress("glBindAttribLocation"); + BindAttribLocation(program, index, name); + } + + static void CODEGEN_FUNCPTR Switch_CompileShader(GLuint shader) + { + CompileShader = (PFNCOMPILESHADERPROC)IntGetProcAddress("glCompileShader"); + CompileShader(shader); + } + + static GLuint CODEGEN_FUNCPTR Switch_CreateProgram() + { + CreateProgram = (PFNCREATEPROGRAMPROC)IntGetProcAddress("glCreateProgram"); + return CreateProgram(); + } + + static GLuint CODEGEN_FUNCPTR Switch_CreateShader(GLenum type) + { + CreateShader = (PFNCREATESHADERPROC)IntGetProcAddress("glCreateShader"); + return CreateShader(type); + } + + static void CODEGEN_FUNCPTR Switch_DeleteProgram(GLuint program) + { + DeleteProgram = (PFNDELETEPROGRAMPROC)IntGetProcAddress("glDeleteProgram"); + DeleteProgram(program); + } + + static void CODEGEN_FUNCPTR Switch_DeleteShader(GLuint shader) + { + DeleteShader = (PFNDELETESHADERPROC)IntGetProcAddress("glDeleteShader"); + DeleteShader(shader); + } + + static void CODEGEN_FUNCPTR Switch_DetachShader(GLuint program, GLuint shader) + { + DetachShader = (PFNDETACHSHADERPROC)IntGetProcAddress("glDetachShader"); + DetachShader(program, shader); + } + + static void CODEGEN_FUNCPTR Switch_DisableVertexAttribArray(GLuint index) + { + DisableVertexAttribArray = (PFNDISABLEVERTEXATTRIBARRAYPROC)IntGetProcAddress("glDisableVertexAttribArray"); + DisableVertexAttribArray(index); + } + + static void CODEGEN_FUNCPTR Switch_EnableVertexAttribArray(GLuint index) + { + EnableVertexAttribArray = (PFNENABLEVERTEXATTRIBARRAYPROC)IntGetProcAddress("glEnableVertexAttribArray"); + EnableVertexAttribArray(index); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveAttrib(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name) + { + GetActiveAttrib = (PFNGETACTIVEATTRIBPROC)IntGetProcAddress("glGetActiveAttrib"); + GetActiveAttrib(program, index, bufSize, length, size, type, name); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniform(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name) + { + GetActiveUniform = (PFNGETACTIVEUNIFORMPROC)IntGetProcAddress("glGetActiveUniform"); + GetActiveUniform(program, index, bufSize, length, size, type, name); + } + + static void CODEGEN_FUNCPTR Switch_GetAttachedShaders(GLuint program, GLsizei maxCount, GLsizei *count, GLuint *obj) + { + GetAttachedShaders = (PFNGETATTACHEDSHADERSPROC)IntGetProcAddress("glGetAttachedShaders"); + GetAttachedShaders(program, maxCount, count, obj); + } + + static GLint CODEGEN_FUNCPTR Switch_GetAttribLocation(GLuint program, const GLchar *name) + { + GetAttribLocation = (PFNGETATTRIBLOCATIONPROC)IntGetProcAddress("glGetAttribLocation"); + return GetAttribLocation(program, name); + } + + static void CODEGEN_FUNCPTR Switch_GetProgramiv(GLuint program, GLenum pname, GLint *params) + { + GetProgramiv = 
(PFNGETPROGRAMIVPROC)IntGetProcAddress("glGetProgramiv"); + GetProgramiv(program, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetProgramInfoLog(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog) + { + GetProgramInfoLog = (PFNGETPROGRAMINFOLOGPROC)IntGetProcAddress("glGetProgramInfoLog"); + GetProgramInfoLog(program, bufSize, length, infoLog); + } + + static void CODEGEN_FUNCPTR Switch_GetShaderiv(GLuint shader, GLenum pname, GLint *params) + { + GetShaderiv = (PFNGETSHADERIVPROC)IntGetProcAddress("glGetShaderiv"); + GetShaderiv(shader, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetShaderInfoLog(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog) + { + GetShaderInfoLog = (PFNGETSHADERINFOLOGPROC)IntGetProcAddress("glGetShaderInfoLog"); + GetShaderInfoLog(shader, bufSize, length, infoLog); + } + + static void CODEGEN_FUNCPTR Switch_GetShaderSource(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *source) + { + GetShaderSource = (PFNGETSHADERSOURCEPROC)IntGetProcAddress("glGetShaderSource"); + GetShaderSource(shader, bufSize, length, source); + } + + static GLint CODEGEN_FUNCPTR Switch_GetUniformLocation(GLuint program, const GLchar *name) + { + GetUniformLocation = (PFNGETUNIFORMLOCATIONPROC)IntGetProcAddress("glGetUniformLocation"); + return GetUniformLocation(program, name); + } + + static void CODEGEN_FUNCPTR Switch_GetUniformfv(GLuint program, GLint location, GLfloat *params) + { + GetUniformfv = (PFNGETUNIFORMFVPROC)IntGetProcAddress("glGetUniformfv"); + GetUniformfv(program, location, params); + } + + static void CODEGEN_FUNCPTR Switch_GetUniformiv(GLuint program, GLint location, GLint *params) + { + GetUniformiv = (PFNGETUNIFORMIVPROC)IntGetProcAddress("glGetUniformiv"); + GetUniformiv(program, location, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribdv(GLuint index, GLenum pname, GLdouble *params) + { + GetVertexAttribdv = (PFNGETVERTEXATTRIBDVPROC)IntGetProcAddress("glGetVertexAttribdv"); + GetVertexAttribdv(index, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribfv(GLuint index, GLenum pname, GLfloat *params) + { + GetVertexAttribfv = (PFNGETVERTEXATTRIBFVPROC)IntGetProcAddress("glGetVertexAttribfv"); + GetVertexAttribfv(index, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribiv(GLuint index, GLenum pname, GLint *params) + { + GetVertexAttribiv = (PFNGETVERTEXATTRIBIVPROC)IntGetProcAddress("glGetVertexAttribiv"); + GetVertexAttribiv(index, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribPointerv(GLuint index, GLenum pname, GLvoid* *pointer) + { + GetVertexAttribPointerv = (PFNGETVERTEXATTRIBPOINTERVPROC)IntGetProcAddress("glGetVertexAttribPointerv"); + GetVertexAttribPointerv(index, pname, pointer); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsProgram(GLuint program) + { + IsProgram = (PFNISPROGRAMPROC)IntGetProcAddress("glIsProgram"); + return IsProgram(program); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsShader(GLuint shader) + { + IsShader = (PFNISSHADERPROC)IntGetProcAddress("glIsShader"); + return IsShader(shader); + } + + static void CODEGEN_FUNCPTR Switch_LinkProgram(GLuint program) + { + LinkProgram = (PFNLINKPROGRAMPROC)IntGetProcAddress("glLinkProgram"); + LinkProgram(program); + } + + static void CODEGEN_FUNCPTR Switch_ShaderSource(GLuint shader, GLsizei count, const GLchar* const *string, const GLint *length) + { + ShaderSource = (PFNSHADERSOURCEPROC)IntGetProcAddress("glShaderSource"); + 
ShaderSource(shader, count, string, length); + } + + static void CODEGEN_FUNCPTR Switch_UseProgram(GLuint program) + { + UseProgram = (PFNUSEPROGRAMPROC)IntGetProcAddress("glUseProgram"); + UseProgram(program); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1f(GLint location, GLfloat v0) + { + Uniform1f = (PFNUNIFORM1FPROC)IntGetProcAddress("glUniform1f"); + Uniform1f(location, v0); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2f(GLint location, GLfloat v0, GLfloat v1) + { + Uniform2f = (PFNUNIFORM2FPROC)IntGetProcAddress("glUniform2f"); + Uniform2f(location, v0, v1); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3f(GLint location, GLfloat v0, GLfloat v1, GLfloat v2) + { + Uniform3f = (PFNUNIFORM3FPROC)IntGetProcAddress("glUniform3f"); + Uniform3f(location, v0, v1, v2); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4f(GLint location, GLfloat v0, GLfloat v1, GLfloat v2, GLfloat v3) + { + Uniform4f = (PFNUNIFORM4FPROC)IntGetProcAddress("glUniform4f"); + Uniform4f(location, v0, v1, v2, v3); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1i(GLint location, GLint v0) + { + Uniform1i = (PFNUNIFORM1IPROC)IntGetProcAddress("glUniform1i"); + Uniform1i(location, v0); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2i(GLint location, GLint v0, GLint v1) + { + Uniform2i = (PFNUNIFORM2IPROC)IntGetProcAddress("glUniform2i"); + Uniform2i(location, v0, v1); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3i(GLint location, GLint v0, GLint v1, GLint v2) + { + Uniform3i = (PFNUNIFORM3IPROC)IntGetProcAddress("glUniform3i"); + Uniform3i(location, v0, v1, v2); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4i(GLint location, GLint v0, GLint v1, GLint v2, GLint v3) + { + Uniform4i = (PFNUNIFORM4IPROC)IntGetProcAddress("glUniform4i"); + Uniform4i(location, v0, v1, v2, v3); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1fv(GLint location, GLsizei count, const GLfloat *value) + { + Uniform1fv = (PFNUNIFORM1FVPROC)IntGetProcAddress("glUniform1fv"); + Uniform1fv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2fv(GLint location, GLsizei count, const GLfloat *value) + { + Uniform2fv = (PFNUNIFORM2FVPROC)IntGetProcAddress("glUniform2fv"); + Uniform2fv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3fv(GLint location, GLsizei count, const GLfloat *value) + { + Uniform3fv = (PFNUNIFORM3FVPROC)IntGetProcAddress("glUniform3fv"); + Uniform3fv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4fv(GLint location, GLsizei count, const GLfloat *value) + { + Uniform4fv = (PFNUNIFORM4FVPROC)IntGetProcAddress("glUniform4fv"); + Uniform4fv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1iv(GLint location, GLsizei count, const GLint *value) + { + Uniform1iv = (PFNUNIFORM1IVPROC)IntGetProcAddress("glUniform1iv"); + Uniform1iv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2iv(GLint location, GLsizei count, const GLint *value) + { + Uniform2iv = (PFNUNIFORM2IVPROC)IntGetProcAddress("glUniform2iv"); + Uniform2iv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3iv(GLint location, GLsizei count, const GLint *value) + { + Uniform3iv = (PFNUNIFORM3IVPROC)IntGetProcAddress("glUniform3iv"); + Uniform3iv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4iv(GLint location, GLsizei count, const GLint *value) + { + Uniform4iv = (PFNUNIFORM4IVPROC)IntGetProcAddress("glUniform4iv"); + Uniform4iv(location, count, value); + } + + 
static void CODEGEN_FUNCPTR Switch_UniformMatrix2fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix2fv = (PFNUNIFORMMATRIX2FVPROC)IntGetProcAddress("glUniformMatrix2fv"); + UniformMatrix2fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix3fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix3fv = (PFNUNIFORMMATRIX3FVPROC)IntGetProcAddress("glUniformMatrix3fv"); + UniformMatrix3fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix4fv = (PFNUNIFORMMATRIX4FVPROC)IntGetProcAddress("glUniformMatrix4fv"); + UniformMatrix4fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_ValidateProgram(GLuint program) + { + ValidateProgram = (PFNVALIDATEPROGRAMPROC)IntGetProcAddress("glValidateProgram"); + ValidateProgram(program); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const GLvoid *pointer) + { + VertexAttribPointer = (PFNVERTEXATTRIBPOINTERPROC)IntGetProcAddress("glVertexAttribPointer"); + VertexAttribPointer(index, size, type, normalized, stride, pointer); + } + + // Extension: 2.1 + + static void CODEGEN_FUNCPTR Switch_UniformMatrix2x3fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix2x3fv = (PFNUNIFORMMATRIX2X3FVPROC)IntGetProcAddress("glUniformMatrix2x3fv"); + UniformMatrix2x3fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix3x2fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix3x2fv = (PFNUNIFORMMATRIX3X2FVPROC)IntGetProcAddress("glUniformMatrix3x2fv"); + UniformMatrix3x2fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix2x4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix2x4fv = (PFNUNIFORMMATRIX2X4FVPROC)IntGetProcAddress("glUniformMatrix2x4fv"); + UniformMatrix2x4fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix4x2fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix4x2fv = (PFNUNIFORMMATRIX4X2FVPROC)IntGetProcAddress("glUniformMatrix4x2fv"); + UniformMatrix4x2fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix3x4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix3x4fv = (PFNUNIFORMMATRIX3X4FVPROC)IntGetProcAddress("glUniformMatrix3x4fv"); + UniformMatrix3x4fv(location, count, transpose, value); + } + + static void CODEGEN_FUNCPTR Switch_UniformMatrix4x3fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value) + { + UniformMatrix4x3fv = (PFNUNIFORMMATRIX4X3FVPROC)IntGetProcAddress("glUniformMatrix4x3fv"); + UniformMatrix4x3fv(location, count, transpose, value); + } + + // Extension: ARB_vertex_array_object + + static void CODEGEN_FUNCPTR Switch_BindVertexArray(GLuint ren_array) + { + BindVertexArray = (PFNBINDVERTEXARRAYPROC)IntGetProcAddress("glBindVertexArray"); + BindVertexArray(ren_array); + } + + static void CODEGEN_FUNCPTR Switch_DeleteVertexArrays(GLsizei n, const GLuint *arrays) + { + DeleteVertexArrays = (PFNDELETEVERTEXARRAYSPROC)IntGetProcAddress("glDeleteVertexArrays"); + 
DeleteVertexArrays(n, arrays); + } + + static void CODEGEN_FUNCPTR Switch_GenVertexArrays(GLsizei n, GLuint *arrays) + { + GenVertexArrays = (PFNGENVERTEXARRAYSPROC)IntGetProcAddress("glGenVertexArrays"); + GenVertexArrays(n, arrays); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsVertexArray(GLuint ren_array) + { + IsVertexArray = (PFNISVERTEXARRAYPROC)IntGetProcAddress("glIsVertexArray"); + return IsVertexArray(ren_array); + } + + // Extension: ARB_map_buffer_range + + static GLvoid* CODEGEN_FUNCPTR Switch_MapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access) + { + MapBufferRange = (PFNMAPBUFFERRANGEPROC)IntGetProcAddress("glMapBufferRange"); + return MapBufferRange(target, offset, length, access); + } + + static void CODEGEN_FUNCPTR Switch_FlushMappedBufferRange(GLenum target, GLintptr offset, GLsizeiptr length) + { + FlushMappedBufferRange = (PFNFLUSHMAPPEDBUFFERRANGEPROC)IntGetProcAddress("glFlushMappedBufferRange"); + FlushMappedBufferRange(target, offset, length); + } + + // Extension: ARB_framebuffer_object + + static GLboolean CODEGEN_FUNCPTR Switch_IsRenderbuffer(GLuint renderbuffer) + { + IsRenderbuffer = (PFNISRENDERBUFFERPROC)IntGetProcAddress("glIsRenderbuffer"); + return IsRenderbuffer(renderbuffer); + } + + static void CODEGEN_FUNCPTR Switch_BindRenderbuffer(GLenum target, GLuint renderbuffer) + { + BindRenderbuffer = (PFNBINDRENDERBUFFERPROC)IntGetProcAddress("glBindRenderbuffer"); + BindRenderbuffer(target, renderbuffer); + } + + static void CODEGEN_FUNCPTR Switch_DeleteRenderbuffers(GLsizei n, const GLuint *renderbuffers) + { + DeleteRenderbuffers = (PFNDELETERENDERBUFFERSPROC)IntGetProcAddress("glDeleteRenderbuffers"); + DeleteRenderbuffers(n, renderbuffers); + } + + static void CODEGEN_FUNCPTR Switch_GenRenderbuffers(GLsizei n, GLuint *renderbuffers) + { + GenRenderbuffers = (PFNGENRENDERBUFFERSPROC)IntGetProcAddress("glGenRenderbuffers"); + GenRenderbuffers(n, renderbuffers); + } + + static void CODEGEN_FUNCPTR Switch_RenderbufferStorage(GLenum target, GLenum internalformat, GLsizei width, GLsizei height) + { + RenderbufferStorage = (PFNRENDERBUFFERSTORAGEPROC)IntGetProcAddress("glRenderbufferStorage"); + RenderbufferStorage(target, internalformat, width, height); + } + + static void CODEGEN_FUNCPTR Switch_GetRenderbufferParameteriv(GLenum target, GLenum pname, GLint *params) + { + GetRenderbufferParameteriv = (PFNGETRENDERBUFFERPARAMETERIVPROC)IntGetProcAddress("glGetRenderbufferParameteriv"); + GetRenderbufferParameteriv(target, pname, params); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsFramebuffer(GLuint framebuffer) + { + IsFramebuffer = (PFNISFRAMEBUFFERPROC)IntGetProcAddress("glIsFramebuffer"); + return IsFramebuffer(framebuffer); + } + + static void CODEGEN_FUNCPTR Switch_BindFramebuffer(GLenum target, GLuint framebuffer) + { + BindFramebuffer = (PFNBINDFRAMEBUFFERPROC)IntGetProcAddress("glBindFramebuffer"); + BindFramebuffer(target, framebuffer); + } + + static void CODEGEN_FUNCPTR Switch_DeleteFramebuffers(GLsizei n, const GLuint *framebuffers) + { + DeleteFramebuffers = (PFNDELETEFRAMEBUFFERSPROC)IntGetProcAddress("glDeleteFramebuffers"); + DeleteFramebuffers(n, framebuffers); + } + + static void CODEGEN_FUNCPTR Switch_GenFramebuffers(GLsizei n, GLuint *framebuffers) + { + GenFramebuffers = (PFNGENFRAMEBUFFERSPROC)IntGetProcAddress("glGenFramebuffers"); + GenFramebuffers(n, framebuffers); + } + + static GLenum CODEGEN_FUNCPTR Switch_CheckFramebufferStatus(GLenum target) + { + CheckFramebufferStatus = 
(PFNCHECKFRAMEBUFFERSTATUSPROC)IntGetProcAddress("glCheckFramebufferStatus"); + return CheckFramebufferStatus(target); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferTexture1D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level) + { + FramebufferTexture1D = (PFNFRAMEBUFFERTEXTURE1DPROC)IntGetProcAddress("glFramebufferTexture1D"); + FramebufferTexture1D(target, attachment, textarget, texture, level); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level) + { + FramebufferTexture2D = (PFNFRAMEBUFFERTEXTURE2DPROC)IntGetProcAddress("glFramebufferTexture2D"); + FramebufferTexture2D(target, attachment, textarget, texture, level); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferTexture3D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset) + { + FramebufferTexture3D = (PFNFRAMEBUFFERTEXTURE3DPROC)IntGetProcAddress("glFramebufferTexture3D"); + FramebufferTexture3D(target, attachment, textarget, texture, level, zoffset); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferRenderbuffer(GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer) + { + FramebufferRenderbuffer = (PFNFRAMEBUFFERRENDERBUFFERPROC)IntGetProcAddress("glFramebufferRenderbuffer"); + FramebufferRenderbuffer(target, attachment, renderbuffertarget, renderbuffer); + } + + static void CODEGEN_FUNCPTR Switch_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment, GLenum pname, GLint *params) + { + GetFramebufferAttachmentParameteriv = (PFNGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)IntGetProcAddress("glGetFramebufferAttachmentParameteriv"); + GetFramebufferAttachmentParameteriv(target, attachment, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GenerateMipmap(GLenum target) + { + GenerateMipmap = (PFNGENERATEMIPMAPPROC)IntGetProcAddress("glGenerateMipmap"); + GenerateMipmap(target); + } + + static void CODEGEN_FUNCPTR Switch_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter) + { + BlitFramebuffer = (PFNBLITFRAMEBUFFERPROC)IntGetProcAddress("glBlitFramebuffer"); + BlitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter); + } + + static void CODEGEN_FUNCPTR Switch_RenderbufferStorageMultisample(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height) + { + RenderbufferStorageMultisample = (PFNRENDERBUFFERSTORAGEMULTISAMPLEPROC)IntGetProcAddress("glRenderbufferStorageMultisample"); + RenderbufferStorageMultisample(target, samples, internalformat, width, height); + } + + static void CODEGEN_FUNCPTR Switch_FramebufferTextureLayer(GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer) + { + FramebufferTextureLayer = (PFNFRAMEBUFFERTEXTURELAYERPROC)IntGetProcAddress("glFramebufferTextureLayer"); + FramebufferTextureLayer(target, attachment, texture, level, layer); + } + + // Extension: 3.0 + + static void CODEGEN_FUNCPTR Switch_ColorMaski(GLuint index, GLboolean r, GLboolean g, GLboolean b, GLboolean a) + { + ColorMaski = (PFNCOLORMASKIPROC)IntGetProcAddress("glColorMaski"); + ColorMaski(index, r, g, b, a); + } + + static void CODEGEN_FUNCPTR Switch_GetBooleani_v(GLenum target, GLuint index, GLboolean *data) + { + GetBooleani_v = (PFNGETBOOLEANI_VPROC)IntGetProcAddress("glGetBooleani_v"); + GetBooleani_v(target, index, data); + } + + static void 
CODEGEN_FUNCPTR Switch_GetIntegeri_v(GLenum target, GLuint index, GLint *data) + { + GetIntegeri_v = (PFNGETINTEGERI_VPROC)IntGetProcAddress("glGetIntegeri_v"); + GetIntegeri_v(target, index, data); + } + + static void CODEGEN_FUNCPTR Switch_Enablei(GLenum target, GLuint index) + { + Enablei = (PFNENABLEIPROC)IntGetProcAddress("glEnablei"); + Enablei(target, index); + } + + static void CODEGEN_FUNCPTR Switch_Disablei(GLenum target, GLuint index) + { + Disablei = (PFNDISABLEIPROC)IntGetProcAddress("glDisablei"); + Disablei(target, index); + } + + static GLboolean CODEGEN_FUNCPTR Switch_IsEnabledi(GLenum target, GLuint index) + { + IsEnabledi = (PFNISENABLEDIPROC)IntGetProcAddress("glIsEnabledi"); + return IsEnabledi(target, index); + } + + static void CODEGEN_FUNCPTR Switch_BeginTransformFeedback(GLenum primitiveMode) + { + BeginTransformFeedback = (PFNBEGINTRANSFORMFEEDBACKPROC)IntGetProcAddress("glBeginTransformFeedback"); + BeginTransformFeedback(primitiveMode); + } + + static void CODEGEN_FUNCPTR Switch_EndTransformFeedback() + { + EndTransformFeedback = (PFNENDTRANSFORMFEEDBACKPROC)IntGetProcAddress("glEndTransformFeedback"); + EndTransformFeedback(); + } + + static void CODEGEN_FUNCPTR Switch_BindBufferRange(GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size) + { + BindBufferRange = (PFNBINDBUFFERRANGEPROC)IntGetProcAddress("glBindBufferRange"); + BindBufferRange(target, index, buffer, offset, size); + } + + static void CODEGEN_FUNCPTR Switch_BindBufferBase(GLenum target, GLuint index, GLuint buffer) + { + BindBufferBase = (PFNBINDBUFFERBASEPROC)IntGetProcAddress("glBindBufferBase"); + BindBufferBase(target, index, buffer); + } + + static void CODEGEN_FUNCPTR Switch_TransformFeedbackVaryings(GLuint program, GLsizei count, const GLchar* const *varyings, GLenum bufferMode) + { + TransformFeedbackVaryings = (PFNTRANSFORMFEEDBACKVARYINGSPROC)IntGetProcAddress("glTransformFeedbackVaryings"); + TransformFeedbackVaryings(program, count, varyings, bufferMode); + } + + static void CODEGEN_FUNCPTR Switch_GetTransformFeedbackVarying(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLsizei *size, GLenum *type, GLchar *name) + { + GetTransformFeedbackVarying = (PFNGETTRANSFORMFEEDBACKVARYINGPROC)IntGetProcAddress("glGetTransformFeedbackVarying"); + GetTransformFeedbackVarying(program, index, bufSize, length, size, type, name); + } + + static void CODEGEN_FUNCPTR Switch_ClampColor(GLenum target, GLenum clamp) + { + ClampColor = (PFNCLAMPCOLORPROC)IntGetProcAddress("glClampColor"); + ClampColor(target, clamp); + } + + static void CODEGEN_FUNCPTR Switch_BeginConditionalRender(GLuint id, GLenum mode) + { + BeginConditionalRender = (PFNBEGINCONDITIONALRENDERPROC)IntGetProcAddress("glBeginConditionalRender"); + BeginConditionalRender(id, mode); + } + + static void CODEGEN_FUNCPTR Switch_EndConditionalRender() + { + EndConditionalRender = (PFNENDCONDITIONALRENDERPROC)IntGetProcAddress("glEndConditionalRender"); + EndConditionalRender(); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribIPointer(GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid *pointer) + { + VertexAttribIPointer = (PFNVERTEXATTRIBIPOINTERPROC)IntGetProcAddress("glVertexAttribIPointer"); + VertexAttribIPointer(index, size, type, stride, pointer); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribIiv(GLuint index, GLenum pname, GLint *params) + { + GetVertexAttribIiv = (PFNGETVERTEXATTRIBIIVPROC)IntGetProcAddress("glGetVertexAttribIiv"); + GetVertexAttribIiv(index, 
pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetVertexAttribIuiv(GLuint index, GLenum pname, GLuint *params) + { + GetVertexAttribIuiv = (PFNGETVERTEXATTRIBIUIVPROC)IntGetProcAddress("glGetVertexAttribIuiv"); + GetVertexAttribIuiv(index, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI1i(GLuint index, GLint x) + { + VertexAttribI1i = (PFNVERTEXATTRIBI1IPROC)IntGetProcAddress("glVertexAttribI1i"); + VertexAttribI1i(index, x); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI2i(GLuint index, GLint x, GLint y) + { + VertexAttribI2i = (PFNVERTEXATTRIBI2IPROC)IntGetProcAddress("glVertexAttribI2i"); + VertexAttribI2i(index, x, y); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI3i(GLuint index, GLint x, GLint y, GLint z) + { + VertexAttribI3i = (PFNVERTEXATTRIBI3IPROC)IntGetProcAddress("glVertexAttribI3i"); + VertexAttribI3i(index, x, y, z); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4i(GLuint index, GLint x, GLint y, GLint z, GLint w) + { + VertexAttribI4i = (PFNVERTEXATTRIBI4IPROC)IntGetProcAddress("glVertexAttribI4i"); + VertexAttribI4i(index, x, y, z, w); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI1ui(GLuint index, GLuint x) + { + VertexAttribI1ui = (PFNVERTEXATTRIBI1UIPROC)IntGetProcAddress("glVertexAttribI1ui"); + VertexAttribI1ui(index, x); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI2ui(GLuint index, GLuint x, GLuint y) + { + VertexAttribI2ui = (PFNVERTEXATTRIBI2UIPROC)IntGetProcAddress("glVertexAttribI2ui"); + VertexAttribI2ui(index, x, y); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI3ui(GLuint index, GLuint x, GLuint y, GLuint z) + { + VertexAttribI3ui = (PFNVERTEXATTRIBI3UIPROC)IntGetProcAddress("glVertexAttribI3ui"); + VertexAttribI3ui(index, x, y, z); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4ui(GLuint index, GLuint x, GLuint y, GLuint z, GLuint w) + { + VertexAttribI4ui = (PFNVERTEXATTRIBI4UIPROC)IntGetProcAddress("glVertexAttribI4ui"); + VertexAttribI4ui(index, x, y, z, w); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI1iv(GLuint index, const GLint *v) + { + VertexAttribI1iv = (PFNVERTEXATTRIBI1IVPROC)IntGetProcAddress("glVertexAttribI1iv"); + VertexAttribI1iv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI2iv(GLuint index, const GLint *v) + { + VertexAttribI2iv = (PFNVERTEXATTRIBI2IVPROC)IntGetProcAddress("glVertexAttribI2iv"); + VertexAttribI2iv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI3iv(GLuint index, const GLint *v) + { + VertexAttribI3iv = (PFNVERTEXATTRIBI3IVPROC)IntGetProcAddress("glVertexAttribI3iv"); + VertexAttribI3iv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4iv(GLuint index, const GLint *v) + { + VertexAttribI4iv = (PFNVERTEXATTRIBI4IVPROC)IntGetProcAddress("glVertexAttribI4iv"); + VertexAttribI4iv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI1uiv(GLuint index, const GLuint *v) + { + VertexAttribI1uiv = (PFNVERTEXATTRIBI1UIVPROC)IntGetProcAddress("glVertexAttribI1uiv"); + VertexAttribI1uiv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI2uiv(GLuint index, const GLuint *v) + { + VertexAttribI2uiv = (PFNVERTEXATTRIBI2UIVPROC)IntGetProcAddress("glVertexAttribI2uiv"); + VertexAttribI2uiv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI3uiv(GLuint index, const GLuint *v) + { + VertexAttribI3uiv = (PFNVERTEXATTRIBI3UIVPROC)IntGetProcAddress("glVertexAttribI3uiv"); + VertexAttribI3uiv(index, v); + } + + static void 
CODEGEN_FUNCPTR Switch_VertexAttribI4uiv(GLuint index, const GLuint *v) + { + VertexAttribI4uiv = (PFNVERTEXATTRIBI4UIVPROC)IntGetProcAddress("glVertexAttribI4uiv"); + VertexAttribI4uiv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4bv(GLuint index, const GLbyte *v) + { + VertexAttribI4bv = (PFNVERTEXATTRIBI4BVPROC)IntGetProcAddress("glVertexAttribI4bv"); + VertexAttribI4bv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4sv(GLuint index, const GLshort *v) + { + VertexAttribI4sv = (PFNVERTEXATTRIBI4SVPROC)IntGetProcAddress("glVertexAttribI4sv"); + VertexAttribI4sv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4ubv(GLuint index, const GLubyte *v) + { + VertexAttribI4ubv = (PFNVERTEXATTRIBI4UBVPROC)IntGetProcAddress("glVertexAttribI4ubv"); + VertexAttribI4ubv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_VertexAttribI4usv(GLuint index, const GLushort *v) + { + VertexAttribI4usv = (PFNVERTEXATTRIBI4USVPROC)IntGetProcAddress("glVertexAttribI4usv"); + VertexAttribI4usv(index, v); + } + + static void CODEGEN_FUNCPTR Switch_GetUniformuiv(GLuint program, GLint location, GLuint *params) + { + GetUniformuiv = (PFNGETUNIFORMUIVPROC)IntGetProcAddress("glGetUniformuiv"); + GetUniformuiv(program, location, params); + } + + static void CODEGEN_FUNCPTR Switch_BindFragDataLocation(GLuint program, GLuint color, const GLchar *name) + { + BindFragDataLocation = (PFNBINDFRAGDATALOCATIONPROC)IntGetProcAddress("glBindFragDataLocation"); + BindFragDataLocation(program, color, name); + } + + static GLint CODEGEN_FUNCPTR Switch_GetFragDataLocation(GLuint program, const GLchar *name) + { + GetFragDataLocation = (PFNGETFRAGDATALOCATIONPROC)IntGetProcAddress("glGetFragDataLocation"); + return GetFragDataLocation(program, name); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1ui(GLint location, GLuint v0) + { + Uniform1ui = (PFNUNIFORM1UIPROC)IntGetProcAddress("glUniform1ui"); + Uniform1ui(location, v0); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2ui(GLint location, GLuint v0, GLuint v1) + { + Uniform2ui = (PFNUNIFORM2UIPROC)IntGetProcAddress("glUniform2ui"); + Uniform2ui(location, v0, v1); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3ui(GLint location, GLuint v0, GLuint v1, GLuint v2) + { + Uniform3ui = (PFNUNIFORM3UIPROC)IntGetProcAddress("glUniform3ui"); + Uniform3ui(location, v0, v1, v2); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4ui(GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3) + { + Uniform4ui = (PFNUNIFORM4UIPROC)IntGetProcAddress("glUniform4ui"); + Uniform4ui(location, v0, v1, v2, v3); + } + + static void CODEGEN_FUNCPTR Switch_Uniform1uiv(GLint location, GLsizei count, const GLuint *value) + { + Uniform1uiv = (PFNUNIFORM1UIVPROC)IntGetProcAddress("glUniform1uiv"); + Uniform1uiv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform2uiv(GLint location, GLsizei count, const GLuint *value) + { + Uniform2uiv = (PFNUNIFORM2UIVPROC)IntGetProcAddress("glUniform2uiv"); + Uniform2uiv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform3uiv(GLint location, GLsizei count, const GLuint *value) + { + Uniform3uiv = (PFNUNIFORM3UIVPROC)IntGetProcAddress("glUniform3uiv"); + Uniform3uiv(location, count, value); + } + + static void CODEGEN_FUNCPTR Switch_Uniform4uiv(GLint location, GLsizei count, const GLuint *value) + { + Uniform4uiv = (PFNUNIFORM4UIVPROC)IntGetProcAddress("glUniform4uiv"); + Uniform4uiv(location, count, value); + } + + static void CODEGEN_FUNCPTR 
Switch_TexParameterIiv(GLenum target, GLenum pname, const GLint *params) + { + TexParameterIiv = (PFNTEXPARAMETERIIVPROC)IntGetProcAddress("glTexParameterIiv"); + TexParameterIiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_TexParameterIuiv(GLenum target, GLenum pname, const GLuint *params) + { + TexParameterIuiv = (PFNTEXPARAMETERIUIVPROC)IntGetProcAddress("glTexParameterIuiv"); + TexParameterIuiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexParameterIiv(GLenum target, GLenum pname, GLint *params) + { + GetTexParameterIiv = (PFNGETTEXPARAMETERIIVPROC)IntGetProcAddress("glGetTexParameterIiv"); + GetTexParameterIiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetTexParameterIuiv(GLenum target, GLenum pname, GLuint *params) + { + GetTexParameterIuiv = (PFNGETTEXPARAMETERIUIVPROC)IntGetProcAddress("glGetTexParameterIuiv"); + GetTexParameterIuiv(target, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value) + { + ClearBufferiv = (PFNCLEARBUFFERIVPROC)IntGetProcAddress("glClearBufferiv"); + ClearBufferiv(buffer, drawbuffer, value); + } + + static void CODEGEN_FUNCPTR Switch_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value) + { + ClearBufferuiv = (PFNCLEARBUFFERUIVPROC)IntGetProcAddress("glClearBufferuiv"); + ClearBufferuiv(buffer, drawbuffer, value); + } + + static void CODEGEN_FUNCPTR Switch_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value) + { + ClearBufferfv = (PFNCLEARBUFFERFVPROC)IntGetProcAddress("glClearBufferfv"); + ClearBufferfv(buffer, drawbuffer, value); + } + + static void CODEGEN_FUNCPTR Switch_ClearBufferfi(GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil) + { + ClearBufferfi = (PFNCLEARBUFFERFIPROC)IntGetProcAddress("glClearBufferfi"); + ClearBufferfi(buffer, drawbuffer, depth, stencil); + } + + static const GLubyte * CODEGEN_FUNCPTR Switch_GetStringi(GLenum name, GLuint index) + { + GetStringi = (PFNGETSTRINGIPROC)IntGetProcAddress("glGetStringi"); + return GetStringi(name, index); + } + + // Extension: ARB_uniform_buffer_object + + static void CODEGEN_FUNCPTR Switch_GetUniformIndices(GLuint program, GLsizei uniformCount, const GLchar* const *uniformNames, GLuint *uniformIndices) + { + GetUniformIndices = (PFNGETUNIFORMINDICESPROC)IntGetProcAddress("glGetUniformIndices"); + GetUniformIndices(program, uniformCount, uniformNames, uniformIndices); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniformsiv(GLuint program, GLsizei uniformCount, const GLuint *uniformIndices, GLenum pname, GLint *params) + { + GetActiveUniformsiv = (PFNGETACTIVEUNIFORMSIVPROC)IntGetProcAddress("glGetActiveUniformsiv"); + GetActiveUniformsiv(program, uniformCount, uniformIndices, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniformName(GLuint program, GLuint uniformIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformName) + { + GetActiveUniformName = (PFNGETACTIVEUNIFORMNAMEPROC)IntGetProcAddress("glGetActiveUniformName"); + GetActiveUniformName(program, uniformIndex, bufSize, length, uniformName); + } + + static GLuint CODEGEN_FUNCPTR Switch_GetUniformBlockIndex(GLuint program, const GLchar *uniformBlockName) + { + GetUniformBlockIndex = (PFNGETUNIFORMBLOCKINDEXPROC)IntGetProcAddress("glGetUniformBlockIndex"); + return GetUniformBlockIndex(program, uniformBlockName); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniformBlockiv(GLuint program, GLuint uniformBlockIndex, GLenum 
pname, GLint *params) + { + GetActiveUniformBlockiv = (PFNGETACTIVEUNIFORMBLOCKIVPROC)IntGetProcAddress("glGetActiveUniformBlockiv"); + GetActiveUniformBlockiv(program, uniformBlockIndex, pname, params); + } + + static void CODEGEN_FUNCPTR Switch_GetActiveUniformBlockName(GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformBlockName) + { + GetActiveUniformBlockName = (PFNGETACTIVEUNIFORMBLOCKNAMEPROC)IntGetProcAddress("glGetActiveUniformBlockName"); + GetActiveUniformBlockName(program, uniformBlockIndex, bufSize, length, uniformBlockName); + } + + static void CODEGEN_FUNCPTR Switch_UniformBlockBinding(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding) + { + UniformBlockBinding = (PFNUNIFORMBLOCKBINDINGPROC)IntGetProcAddress("glUniformBlockBinding"); + UniformBlockBinding(program, uniformBlockIndex, uniformBlockBinding); + } + + // Extension: ARB_copy_buffer + + static void CODEGEN_FUNCPTR Switch_CopyBufferSubData(GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size) + { + CopyBufferSubData = (PFNCOPYBUFFERSUBDATAPROC)IntGetProcAddress("glCopyBufferSubData"); + CopyBufferSubData(readTarget, writeTarget, readOffset, writeOffset, size); + } + + // Extension: 3.1 + + static void CODEGEN_FUNCPTR Switch_DrawArraysInstanced(GLenum mode, GLint first, GLsizei count, GLsizei instancecount) + { + DrawArraysInstanced = (PFNDRAWARRAYSINSTANCEDPROC)IntGetProcAddress("glDrawArraysInstanced"); + DrawArraysInstanced(mode, first, count, instancecount); + } + + static void CODEGEN_FUNCPTR Switch_DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices, GLsizei instancecount) + { + DrawElementsInstanced = (PFNDRAWELEMENTSINSTANCEDPROC)IntGetProcAddress("glDrawElementsInstanced"); + DrawElementsInstanced(mode, count, type, indices, instancecount); + } + + static void CODEGEN_FUNCPTR Switch_TexBuffer(GLenum target, GLenum internalformat, GLuint buffer) + { + TexBuffer = (PFNTEXBUFFERPROC)IntGetProcAddress("glTexBuffer"); + TexBuffer(target, internalformat, buffer); + } + + static void CODEGEN_FUNCPTR Switch_PrimitiveRestartIndex(GLuint index) + { + PrimitiveRestartIndex = (PFNPRIMITIVERESTARTINDEXPROC)IntGetProcAddress("glPrimitiveRestartIndex"); + PrimitiveRestartIndex(index); + } + + // Legacy + + static void CODEGEN_FUNCPTR Switch_EnableClientState(GLenum cap) + { + EnableClientState = (PFNENABLECLIENTSTATEPROC)IntGetProcAddress("glEnableClientState"); + EnableClientState(cap); + } + + static void CODEGEN_FUNCPTR Switch_DisableClientState(GLenum cap) + { + DisableClientState = (PFNDISABLECLIENTSTATEPROC)IntGetProcAddress("glDisableClientState"); + DisableClientState(cap); + } + + static void CODEGEN_FUNCPTR Switch_VertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) + { + VertexPointer = (PFNVERTEXPOINTERPROC)IntGetProcAddress("glVertexPointer"); + VertexPointer(size, type, stride, ptr); + } + + static void CODEGEN_FUNCPTR Switch_NormalPointer(GLenum type, GLsizei stride, const GLvoid *ptr) + { + NormalPointer = (PFNNORMALPOINTERPROC)IntGetProcAddress("glNormalPointer"); + NormalPointer(type, stride, ptr); + } + + static void CODEGEN_FUNCPTR Switch_ColorPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) + { + ColorPointer = (PFNCOLORPOINTERPROC)IntGetProcAddress("glColorPointer"); + ColorPointer(size, type, stride, ptr); + } + + static void CODEGEN_FUNCPTR Switch_TexCoordPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) + { 
+ TexCoordPointer = (PFNTEXCOORDPOINTERPROC)IntGetProcAddress("glTexCoordPointer"); + TexCoordPointer(size, type, stride, ptr); + } + + static void CODEGEN_FUNCPTR Switch_TexEnvi(GLenum target, GLenum pname, GLint param) + { + TexEnvi = (PFNTEXENVIPROC)IntGetProcAddress("glTexEnvi"); + TexEnvi(target, pname, param); + } + + static void CODEGEN_FUNCPTR Switch_MatrixMode(GLenum mode) + { + MatrixMode = (PFNMATRIXMODEPROC)IntGetProcAddress("glMatrixMode"); + MatrixMode(mode); + } + + static void CODEGEN_FUNCPTR Switch_LoadIdentity(void) + { + LoadIdentity = (PFNLOADIDENTITYPROC)IntGetProcAddress("glLoadIdentity"); + LoadIdentity(); + } + + static void CODEGEN_FUNCPTR Switch_Ortho(GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble near_val, GLdouble far_val) + { + Ortho = (PFNORTHOPROC)IntGetProcAddress("glOrtho"); + Ortho(left, right, bottom, top, near_val, far_val); + } + + static void CODEGEN_FUNCPTR Switch_Color3d(GLdouble red, GLdouble green, GLdouble blue) + { + Color3d = (PFNCOLOR3DPROC)IntGetProcAddress("glColor3d"); + Color3d(red, green, blue); + } + + struct InitializeVariables + { + InitializeVariables() + { + // Extension: 1.1 + CullFace = Switch_CullFace; + FrontFace = Switch_FrontFace; + Hint = Switch_Hint; + LineWidth = Switch_LineWidth; + PointSize = Switch_PointSize; + PolygonMode = Switch_PolygonMode; + Scissor = Switch_Scissor; + TexParameterf = Switch_TexParameterf; + TexParameterfv = Switch_TexParameterfv; + TexParameteri = Switch_TexParameteri; + TexParameteriv = Switch_TexParameteriv; + TexImage1D = Switch_TexImage1D; + TexImage2D = Switch_TexImage2D; + DrawBuffer = Switch_DrawBuffer; + Clear = Switch_Clear; + ClearColor = Switch_ClearColor; + ClearStencil = Switch_ClearStencil; + ClearDepth = Switch_ClearDepth; + StencilMask = Switch_StencilMask; + ColorMask = Switch_ColorMask; + DepthMask = Switch_DepthMask; + Disable = Switch_Disable; + Enable = Switch_Enable; + Finish = Switch_Finish; + Flush = Switch_Flush; + BlendFunc = Switch_BlendFunc; + LogicOp = Switch_LogicOp; + StencilFunc = Switch_StencilFunc; + StencilOp = Switch_StencilOp; + DepthFunc = Switch_DepthFunc; + PixelStoref = Switch_PixelStoref; + PixelStorei = Switch_PixelStorei; + ReadBuffer = Switch_ReadBuffer; + ReadPixels = Switch_ReadPixels; + GetBooleanv = Switch_GetBooleanv; + GetDoublev = Switch_GetDoublev; + GetError = Switch_GetError; + GetFloatv = Switch_GetFloatv; + GetIntegerv = Switch_GetIntegerv; + GetString = Switch_GetString; + GetTexImage = Switch_GetTexImage; + GetTexParameterfv = Switch_GetTexParameterfv; + GetTexParameteriv = Switch_GetTexParameteriv; + GetTexLevelParameterfv = Switch_GetTexLevelParameterfv; + GetTexLevelParameteriv = Switch_GetTexLevelParameteriv; + IsEnabled = Switch_IsEnabled; + DepthRange = Switch_DepthRange; + Viewport = Switch_Viewport; + DrawArrays = Switch_DrawArrays; + DrawElements = Switch_DrawElements; + GetPointerv = Switch_GetPointerv; + PolygonOffset = Switch_PolygonOffset; + CopyTexImage1D = Switch_CopyTexImage1D; + CopyTexImage2D = Switch_CopyTexImage2D; + CopyTexSubImage1D = Switch_CopyTexSubImage1D; + CopyTexSubImage2D = Switch_CopyTexSubImage2D; + TexSubImage1D = Switch_TexSubImage1D; + TexSubImage2D = Switch_TexSubImage2D; + BindTexture = Switch_BindTexture; + DeleteTextures = Switch_DeleteTextures; + GenTextures = Switch_GenTextures; + IsTexture = Switch_IsTexture; + Indexub = Switch_Indexub; + Indexubv = Switch_Indexubv; + + // Extension: 1.2 + BlendColor = Switch_BlendColor; + BlendEquation = Switch_BlendEquation; + 
DrawRangeElements = Switch_DrawRangeElements; + TexSubImage3D = Switch_TexSubImage3D; + CopyTexSubImage3D = Switch_CopyTexSubImage3D; + + // Extension: 1.3 + ActiveTexture = Switch_ActiveTexture; + SampleCoverage = Switch_SampleCoverage; + CompressedTexImage3D = Switch_CompressedTexImage3D; + CompressedTexImage2D = Switch_CompressedTexImage2D; + CompressedTexImage1D = Switch_CompressedTexImage1D; + CompressedTexSubImage3D = Switch_CompressedTexSubImage3D; + CompressedTexSubImage2D = Switch_CompressedTexSubImage2D; + CompressedTexSubImage1D = Switch_CompressedTexSubImage1D; + GetCompressedTexImage = Switch_GetCompressedTexImage; + + // Extension: 1.4 + BlendFuncSeparate = Switch_BlendFuncSeparate; + MultiDrawArrays = Switch_MultiDrawArrays; + MultiDrawElements = Switch_MultiDrawElements; + PointParameterf = Switch_PointParameterf; + PointParameterfv = Switch_PointParameterfv; + PointParameteri = Switch_PointParameteri; + PointParameteriv = Switch_PointParameteriv; + + // Extension: 1.5 + GenQueries = Switch_GenQueries; + DeleteQueries = Switch_DeleteQueries; + IsQuery = Switch_IsQuery; + BeginQuery = Switch_BeginQuery; + EndQuery = Switch_EndQuery; + GetQueryiv = Switch_GetQueryiv; + GetQueryObjectiv = Switch_GetQueryObjectiv; + GetQueryObjectuiv = Switch_GetQueryObjectuiv; + BindBuffer = Switch_BindBuffer; + DeleteBuffers = Switch_DeleteBuffers; + GenBuffers = Switch_GenBuffers; + IsBuffer = Switch_IsBuffer; + BufferData = Switch_BufferData; + BufferSubData = Switch_BufferSubData; + GetBufferSubData = Switch_GetBufferSubData; + MapBuffer = Switch_MapBuffer; + UnmapBuffer = Switch_UnmapBuffer; + GetBufferParameteriv = Switch_GetBufferParameteriv; + GetBufferPointerv = Switch_GetBufferPointerv; + + // Extension: 2.0 + BlendEquationSeparate = Switch_BlendEquationSeparate; + DrawBuffers = Switch_DrawBuffers; + StencilOpSeparate = Switch_StencilOpSeparate; + StencilFuncSeparate = Switch_StencilFuncSeparate; + StencilMaskSeparate = Switch_StencilMaskSeparate; + AttachShader = Switch_AttachShader; + BindAttribLocation = Switch_BindAttribLocation; + CompileShader = Switch_CompileShader; + CreateProgram = Switch_CreateProgram; + CreateShader = Switch_CreateShader; + DeleteProgram = Switch_DeleteProgram; + DeleteShader = Switch_DeleteShader; + DetachShader = Switch_DetachShader; + DisableVertexAttribArray = Switch_DisableVertexAttribArray; + EnableVertexAttribArray = Switch_EnableVertexAttribArray; + GetActiveAttrib = Switch_GetActiveAttrib; + GetActiveUniform = Switch_GetActiveUniform; + GetAttachedShaders = Switch_GetAttachedShaders; + GetAttribLocation = Switch_GetAttribLocation; + GetProgramiv = Switch_GetProgramiv; + GetProgramInfoLog = Switch_GetProgramInfoLog; + GetShaderiv = Switch_GetShaderiv; + GetShaderInfoLog = Switch_GetShaderInfoLog; + GetShaderSource = Switch_GetShaderSource; + GetUniformLocation = Switch_GetUniformLocation; + GetUniformfv = Switch_GetUniformfv; + GetUniformiv = Switch_GetUniformiv; + GetVertexAttribdv = Switch_GetVertexAttribdv; + GetVertexAttribfv = Switch_GetVertexAttribfv; + GetVertexAttribiv = Switch_GetVertexAttribiv; + GetVertexAttribPointerv = Switch_GetVertexAttribPointerv; + IsProgram = Switch_IsProgram; + IsShader = Switch_IsShader; + LinkProgram = Switch_LinkProgram; + ShaderSource = Switch_ShaderSource; + UseProgram = Switch_UseProgram; + Uniform1f = Switch_Uniform1f; + Uniform2f = Switch_Uniform2f; + Uniform3f = Switch_Uniform3f; + Uniform4f = Switch_Uniform4f; + Uniform1i = Switch_Uniform1i; + Uniform2i = Switch_Uniform2i; + Uniform3i = 
Switch_Uniform3i; + Uniform4i = Switch_Uniform4i; + Uniform1fv = Switch_Uniform1fv; + Uniform2fv = Switch_Uniform2fv; + Uniform3fv = Switch_Uniform3fv; + Uniform4fv = Switch_Uniform4fv; + Uniform1iv = Switch_Uniform1iv; + Uniform2iv = Switch_Uniform2iv; + Uniform3iv = Switch_Uniform3iv; + Uniform4iv = Switch_Uniform4iv; + UniformMatrix2fv = Switch_UniformMatrix2fv; + UniformMatrix3fv = Switch_UniformMatrix3fv; + UniformMatrix4fv = Switch_UniformMatrix4fv; + ValidateProgram = Switch_ValidateProgram; + VertexAttribPointer = Switch_VertexAttribPointer; + + // Extension: 2.1 + UniformMatrix2x3fv = Switch_UniformMatrix2x3fv; + UniformMatrix3x2fv = Switch_UniformMatrix3x2fv; + UniformMatrix2x4fv = Switch_UniformMatrix2x4fv; + UniformMatrix4x2fv = Switch_UniformMatrix4x2fv; + UniformMatrix3x4fv = Switch_UniformMatrix3x4fv; + UniformMatrix4x3fv = Switch_UniformMatrix4x3fv; + + // Extension: ARB_vertex_array_object + BindVertexArray = Switch_BindVertexArray; + DeleteVertexArrays = Switch_DeleteVertexArrays; + GenVertexArrays = Switch_GenVertexArrays; + IsVertexArray = Switch_IsVertexArray; + + // Extension: ARB_map_buffer_range + MapBufferRange = Switch_MapBufferRange; + FlushMappedBufferRange = Switch_FlushMappedBufferRange; + + // Extension: ARB_framebuffer_object + IsRenderbuffer = Switch_IsRenderbuffer; + BindRenderbuffer = Switch_BindRenderbuffer; + DeleteRenderbuffers = Switch_DeleteRenderbuffers; + GenRenderbuffers = Switch_GenRenderbuffers; + RenderbufferStorage = Switch_RenderbufferStorage; + GetRenderbufferParameteriv = Switch_GetRenderbufferParameteriv; + IsFramebuffer = Switch_IsFramebuffer; + BindFramebuffer = Switch_BindFramebuffer; + DeleteFramebuffers = Switch_DeleteFramebuffers; + GenFramebuffers = Switch_GenFramebuffers; + CheckFramebufferStatus = Switch_CheckFramebufferStatus; + FramebufferTexture1D = Switch_FramebufferTexture1D; + FramebufferTexture2D = Switch_FramebufferTexture2D; + FramebufferTexture3D = Switch_FramebufferTexture3D; + FramebufferRenderbuffer = Switch_FramebufferRenderbuffer; + GetFramebufferAttachmentParameteriv = Switch_GetFramebufferAttachmentParameteriv; + GenerateMipmap = Switch_GenerateMipmap; + BlitFramebuffer = Switch_BlitFramebuffer; + RenderbufferStorageMultisample = Switch_RenderbufferStorageMultisample; + FramebufferTextureLayer = Switch_FramebufferTextureLayer; + + // Extension: 3.0 + ColorMaski = Switch_ColorMaski; + GetBooleani_v = Switch_GetBooleani_v; + GetIntegeri_v = Switch_GetIntegeri_v; + Enablei = Switch_Enablei; + Disablei = Switch_Disablei; + IsEnabledi = Switch_IsEnabledi; + BeginTransformFeedback = Switch_BeginTransformFeedback; + EndTransformFeedback = Switch_EndTransformFeedback; + BindBufferRange = Switch_BindBufferRange; + BindBufferBase = Switch_BindBufferBase; + TransformFeedbackVaryings = Switch_TransformFeedbackVaryings; + GetTransformFeedbackVarying = Switch_GetTransformFeedbackVarying; + ClampColor = Switch_ClampColor; + BeginConditionalRender = Switch_BeginConditionalRender; + EndConditionalRender = Switch_EndConditionalRender; + VertexAttribIPointer = Switch_VertexAttribIPointer; + GetVertexAttribIiv = Switch_GetVertexAttribIiv; + GetVertexAttribIuiv = Switch_GetVertexAttribIuiv; + VertexAttribI1i = Switch_VertexAttribI1i; + VertexAttribI2i = Switch_VertexAttribI2i; + VertexAttribI3i = Switch_VertexAttribI3i; + VertexAttribI4i = Switch_VertexAttribI4i; + VertexAttribI1ui = Switch_VertexAttribI1ui; + VertexAttribI2ui = Switch_VertexAttribI2ui; + VertexAttribI3ui = Switch_VertexAttribI3ui; + VertexAttribI4ui = 
Switch_VertexAttribI4ui;
+            VertexAttribI1iv = Switch_VertexAttribI1iv;
+            VertexAttribI2iv = Switch_VertexAttribI2iv;
+            VertexAttribI3iv = Switch_VertexAttribI3iv;
+            VertexAttribI4iv = Switch_VertexAttribI4iv;
+            VertexAttribI1uiv = Switch_VertexAttribI1uiv;
+            VertexAttribI2uiv = Switch_VertexAttribI2uiv;
+            VertexAttribI3uiv = Switch_VertexAttribI3uiv;
+            VertexAttribI4uiv = Switch_VertexAttribI4uiv;
+            VertexAttribI4bv = Switch_VertexAttribI4bv;
+            VertexAttribI4sv = Switch_VertexAttribI4sv;
+            VertexAttribI4ubv = Switch_VertexAttribI4ubv;
+            VertexAttribI4usv = Switch_VertexAttribI4usv;
+            GetUniformuiv = Switch_GetUniformuiv;
+            BindFragDataLocation = Switch_BindFragDataLocation;
+            GetFragDataLocation = Switch_GetFragDataLocation;
+            Uniform1ui = Switch_Uniform1ui;
+            Uniform2ui = Switch_Uniform2ui;
+            Uniform3ui = Switch_Uniform3ui;
+            Uniform4ui = Switch_Uniform4ui;
+            Uniform1uiv = Switch_Uniform1uiv;
+            Uniform2uiv = Switch_Uniform2uiv;
+            Uniform3uiv = Switch_Uniform3uiv;
+            Uniform4uiv = Switch_Uniform4uiv;
+            TexParameterIiv = Switch_TexParameterIiv;
+            TexParameterIuiv = Switch_TexParameterIuiv;
+            GetTexParameterIiv = Switch_GetTexParameterIiv;
+            GetTexParameterIuiv = Switch_GetTexParameterIuiv;
+            ClearBufferiv = Switch_ClearBufferiv;
+            ClearBufferuiv = Switch_ClearBufferuiv;
+            ClearBufferfv = Switch_ClearBufferfv;
+            ClearBufferfi = Switch_ClearBufferfi;
+            GetStringi = Switch_GetStringi;
+
+            // Extension: ARB_uniform_buffer_object
+            GetUniformIndices = Switch_GetUniformIndices;
+            GetActiveUniformsiv = Switch_GetActiveUniformsiv;
+            GetActiveUniformName = Switch_GetActiveUniformName;
+            GetUniformBlockIndex = Switch_GetUniformBlockIndex;
+            GetActiveUniformBlockiv = Switch_GetActiveUniformBlockiv;
+            GetActiveUniformBlockName = Switch_GetActiveUniformBlockName;
+            UniformBlockBinding = Switch_UniformBlockBinding;
+
+            // Extension: ARB_copy_buffer
+            CopyBufferSubData = Switch_CopyBufferSubData;
+
+            // Extension: 3.1
+            DrawArraysInstanced = Switch_DrawArraysInstanced;
+            DrawElementsInstanced = Switch_DrawElementsInstanced;
+            TexBuffer = Switch_TexBuffer;
+            PrimitiveRestartIndex = Switch_PrimitiveRestartIndex;
+
+            // Legacy
+            EnableClientState = Switch_EnableClientState;
+            DisableClientState = Switch_DisableClientState;
+            VertexPointer = Switch_VertexPointer;
+            NormalPointer = Switch_NormalPointer;
+            ColorPointer = Switch_ColorPointer;
+            TexCoordPointer = Switch_TexCoordPointer;
+            TexEnvi = Switch_TexEnvi;
+            MatrixMode = Switch_MatrixMode;
+            LoadIdentity = Switch_LoadIdentity;
+            Ortho = Switch_Ortho;
+            Color3d = Switch_Color3d;
+        }
+    };
+
+    InitializeVariables g_initVariables;
+}
diff --git a/modules/core/src/gl_core_3_1.hpp b/modules/core/src/gl_core_3_1.hpp
new file mode 100644
index 0000000000..50dbee66c7
--- /dev/null
+++ b/modules/core/src/gl_core_3_1.hpp
@@ -0,0 +1,1331 @@
+#ifndef OPENGL_NOLOAD_STYLE_HPP
+#define OPENGL_NOLOAD_STYLE_HPP
+
+#if defined(__gl_h_) || defined(__GL_H__)
+#error Attempt to include auto-generated header after including gl.h
+#endif
+#if defined(__glext_h_) || defined(__GLEXT_H_)
+#error Attempt to include auto-generated header after including glext.h
+#endif
+#if defined(__gl_ATI_h_)
+#error Attempt to include auto-generated header after including glATI.h
+#endif
+
+#define __gl_h_
+#define __GL_H__
+#define __glext_h_
+#define __GLEXT_H_
+#define __gl_ATI_h_
+
+#ifndef APIENTRY
+    #if defined(__MINGW32__)
+        #ifndef WIN32_LEAN_AND_MEAN
+            #define WIN32_LEAN_AND_MEAN 1
+        #endif
+        #ifndef NOMINMAX
+            #define NOMINMAX
+        #endif
+        #include <windows.h>
+    #elif (defined(_MSC_VER) && _MSC_VER >= 800) || defined(_STDCALL_SUPPORTED) || defined(__BORLANDC__)
+        #ifndef WIN32_LEAN_AND_MEAN
+            #define WIN32_LEAN_AND_MEAN 1
+        #endif
+        #ifndef NOMINMAX
+            #define NOMINMAX
+        #endif
+        #include <windows.h>
+    #else
+        #define APIENTRY
+    #endif
+#endif // APIENTRY
+
+#ifndef CODEGEN_FUNCPTR
+    #define CODEGEN_REMOVE_FUNCPTR
+    #if defined(_WIN32)
+        #define CODEGEN_FUNCPTR APIENTRY
+    #else
+        #define CODEGEN_FUNCPTR
+    #endif
+#endif // CODEGEN_FUNCPTR
+
+#ifndef GL_LOAD_GEN_BASIC_OPENGL_TYPEDEFS
+#define GL_LOAD_GEN_BASIC_OPENGL_TYPEDEFS
+    typedef unsigned int GLenum;
+    typedef unsigned char GLboolean;
+    typedef unsigned int GLbitfield;
+    typedef signed char GLbyte;
+    typedef short GLshort;
+    typedef int GLint;
+    typedef int GLsizei;
+    typedef unsigned char GLubyte;
+    typedef unsigned short GLushort;
+    typedef unsigned int GLuint;
+    typedef float GLfloat;
+    typedef float GLclampf;
+    typedef double GLdouble;
+    typedef double GLclampd;
+    #define GLvoid void
+#endif // GL_LOAD_GEN_BASIC_OPENGL_TYPEDEFS
+
+#include <stddef.h>
+
+#ifndef GL_VERSION_2_0
+    // GL type for program/shader text
+    typedef char GLchar;
+#endif
+
+#ifndef GL_VERSION_1_5
+    // GL types for handling large vertex buffer objects
+    typedef ptrdiff_t GLintptr;
+    typedef ptrdiff_t GLsizeiptr;
+#endif
+
+#ifndef GL_ARB_vertex_buffer_object
+    // GL types for handling large vertex buffer objects
+    typedef ptrdiff_t GLintptrARB;
+    typedef ptrdiff_t GLsizeiptrARB;
+#endif
+
+#ifndef GL_ARB_shader_objects
+    // GL types for program/shader text and shader object handles
+    typedef char GLcharARB;
+    typedef unsigned int GLhandleARB;
+#endif
+
+// GL type for "half" precision (s10e5) float data in host memory
+#ifndef GL_ARB_half_float_pixel
+    typedef unsigned short GLhalfARB;
+#endif
+#ifndef GL_NV_half_float
+    typedef unsigned short GLhalfNV;
+#endif
+
+#ifndef GLEXT_64_TYPES_DEFINED
+    // This code block is duplicated in glxext.h, so must be protected
+    #define GLEXT_64_TYPES_DEFINED
+
+    // Define int32_t, int64_t, and uint64_t types for UST/MSC
+    // (as used in the GL_EXT_timer_query extension)
+    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+        #include <inttypes.h>
+    #elif defined(__sun__) || defined(__digital__)
+        #include <inttypes.h>
+        #if defined(__STDC__)
+            #if defined(__arch64__) || defined(_LP64)
+                typedef long int int64_t;
+                typedef unsigned long int uint64_t;
+            #else
+                typedef long long int int64_t;
+                typedef unsigned long long int uint64_t;
+            #endif // __arch64__
+        #endif // __STDC__
+    #elif defined( __VMS ) || defined(__sgi)
+        #include <inttypes.h>
+    #elif defined(__SCO__) || defined(__USLC__)
+        #include <stdint.h>
+    #elif defined(__UNIXOS2__) || defined(__SOL64__)
+        typedef long int int32_t;
+        typedef long long int int64_t;
+        typedef unsigned long long int uint64_t;
+    #elif defined(_WIN32) && defined(__GNUC__)
+        #include <stdint.h>
+    #elif defined(_WIN32)
+        typedef __int32 int32_t;
+        typedef __int64 int64_t;
+        typedef unsigned __int64 uint64_t;
+    #else
+        // Fallback if nothing above works
+        #include <inttypes.h>
+    #endif
+#endif
+
+#ifndef GL_EXT_timer_query
+    typedef int64_t GLint64EXT;
+    typedef uint64_t GLuint64EXT;
+#endif
+
+#ifndef GL_ARB_sync
+    typedef int64_t GLint64;
+    typedef uint64_t GLuint64;
+    typedef struct __GLsync *GLsync;
+#endif
+
+#ifndef GL_ARB_cl_event
+    // These incomplete types let us declare types compatible with OpenCL's cl_context and cl_event
+    struct _cl_context;
+    struct _cl_event;
+#endif
+
+#ifndef GL_ARB_debug_output
+    typedef void (APIENTRY *GLDEBUGPROCARB)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,GLvoid *userParam);
+#endif
+
+#ifndef
GL_AMD_debug_output + typedef void (APIENTRY *GLDEBUGPROCAMD)(GLuint id,GLenum category,GLenum severity,GLsizei length,const GLchar *message,GLvoid *userParam); +#endif + +#ifndef GL_KHR_debug + typedef void (APIENTRY *GLDEBUGPROC)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,GLvoid *userParam); +#endif + +#ifndef GL_NV_vdpau_interop + typedef GLintptr GLvdpauSurfaceNV; +#endif + +namespace gl +{ + enum + { + // Version: 1.1 + DEPTH_BUFFER_BIT = 0x00000100, + STENCIL_BUFFER_BIT = 0x00000400, + COLOR_BUFFER_BIT = 0x00004000, + FALSE_ = 0, + TRUE_ = 1, + POINTS = 0x0000, + LINES = 0x0001, + LINE_LOOP = 0x0002, + LINE_STRIP = 0x0003, + TRIANGLES = 0x0004, + TRIANGLE_STRIP = 0x0005, + TRIANGLE_FAN = 0x0006, + QUADS = 0x0007, + NEVER = 0x0200, + LESS = 0x0201, + EQUAL = 0x0202, + LEQUAL = 0x0203, + GREATER = 0x0204, + NOTEQUAL = 0x0205, + GEQUAL = 0x0206, + ALWAYS = 0x0207, + ZERO = 0, + ONE = 1, + SRC_COLOR = 0x0300, + ONE_MINUS_SRC_COLOR = 0x0301, + SRC_ALPHA = 0x0302, + ONE_MINUS_SRC_ALPHA = 0x0303, + DST_ALPHA = 0x0304, + ONE_MINUS_DST_ALPHA = 0x0305, + DST_COLOR = 0x0306, + ONE_MINUS_DST_COLOR = 0x0307, + SRC_ALPHA_SATURATE = 0x0308, + NONE = 0, + FRONT_LEFT = 0x0400, + FRONT_RIGHT = 0x0401, + BACK_LEFT = 0x0402, + BACK_RIGHT = 0x0403, + FRONT = 0x0404, + BACK = 0x0405, + LEFT = 0x0406, + RIGHT = 0x0407, + FRONT_AND_BACK = 0x0408, + NO_ERROR_ = 0, + INVALID_ENUM = 0x0500, + INVALID_VALUE = 0x0501, + INVALID_OPERATION = 0x0502, + OUT_OF_MEMORY = 0x0505, + CW = 0x0900, + CCW = 0x0901, + POINT_SIZE = 0x0B11, + POINT_SIZE_RANGE = 0x0B12, + POINT_SIZE_GRANULARITY = 0x0B13, + LINE_SMOOTH = 0x0B20, + LINE_WIDTH = 0x0B21, + LINE_WIDTH_RANGE = 0x0B22, + LINE_WIDTH_GRANULARITY = 0x0B23, + POLYGON_MODE = 0x0B40, + POLYGON_SMOOTH = 0x0B41, + CULL_FACE = 0x0B44, + CULL_FACE_MODE = 0x0B45, + FRONT_FACE = 0x0B46, + DEPTH_RANGE = 0x0B70, + DEPTH_TEST = 0x0B71, + DEPTH_WRITEMASK = 0x0B72, + DEPTH_CLEAR_VALUE = 0x0B73, + DEPTH_FUNC = 0x0B74, + STENCIL_TEST = 0x0B90, + STENCIL_CLEAR_VALUE = 0x0B91, + STENCIL_FUNC = 0x0B92, + STENCIL_VALUE_MASK = 0x0B93, + STENCIL_FAIL = 0x0B94, + STENCIL_PASS_DEPTH_FAIL = 0x0B95, + STENCIL_PASS_DEPTH_PASS = 0x0B96, + STENCIL_REF = 0x0B97, + STENCIL_WRITEMASK = 0x0B98, + VIEWPORT = 0x0BA2, + DITHER = 0x0BD0, + BLEND_DST = 0x0BE0, + BLEND_SRC = 0x0BE1, + BLEND = 0x0BE2, + LOGIC_OP_MODE = 0x0BF0, + COLOR_LOGIC_OP = 0x0BF2, + DRAW_BUFFER = 0x0C01, + READ_BUFFER = 0x0C02, + SCISSOR_BOX = 0x0C10, + SCISSOR_TEST = 0x0C11, + COLOR_CLEAR_VALUE = 0x0C22, + COLOR_WRITEMASK = 0x0C23, + DOUBLEBUFFER = 0x0C32, + STEREO = 0x0C33, + LINE_SMOOTH_HINT = 0x0C52, + POLYGON_SMOOTH_HINT = 0x0C53, + UNPACK_SWAP_BYTES = 0x0CF0, + UNPACK_LSB_FIRST = 0x0CF1, + UNPACK_ROW_LENGTH = 0x0CF2, + UNPACK_SKIP_ROWS = 0x0CF3, + UNPACK_SKIP_PIXELS = 0x0CF4, + UNPACK_ALIGNMENT = 0x0CF5, + PACK_SWAP_BYTES = 0x0D00, + PACK_LSB_FIRST = 0x0D01, + PACK_ROW_LENGTH = 0x0D02, + PACK_SKIP_ROWS = 0x0D03, + PACK_SKIP_PIXELS = 0x0D04, + PACK_ALIGNMENT = 0x0D05, + MAX_TEXTURE_SIZE = 0x0D33, + MAX_VIEWPORT_DIMS = 0x0D3A, + SUBPIXEL_BITS = 0x0D50, + TEXTURE_1D = 0x0DE0, + TEXTURE_2D = 0x0DE1, + POLYGON_OFFSET_UNITS = 0x2A00, + POLYGON_OFFSET_POINT = 0x2A01, + POLYGON_OFFSET_LINE = 0x2A02, + POLYGON_OFFSET_FILL = 0x8037, + POLYGON_OFFSET_FACTOR = 0x8038, + TEXTURE_BINDING_1D = 0x8068, + TEXTURE_BINDING_2D = 0x8069, + TEXTURE_WIDTH = 0x1000, + TEXTURE_HEIGHT = 0x1001, + TEXTURE_INTERNAL_FORMAT = 0x1003, + TEXTURE_BORDER_COLOR = 0x1004, + TEXTURE_RED_SIZE = 0x805C, + TEXTURE_GREEN_SIZE 
= 0x805D, + TEXTURE_BLUE_SIZE = 0x805E, + TEXTURE_ALPHA_SIZE = 0x805F, + DONT_CARE = 0x1100, + FASTEST = 0x1101, + NICEST = 0x1102, + BYTE = 0x1400, + UNSIGNED_BYTE = 0x1401, + SHORT = 0x1402, + UNSIGNED_SHORT = 0x1403, + INT = 0x1404, + UNSIGNED_INT = 0x1405, + FLOAT = 0x1406, + DOUBLE = 0x140A, + CLEAR = 0x1500, + AND = 0x1501, + AND_REVERSE = 0x1502, + COPY = 0x1503, + AND_INVERTED = 0x1504, + NOOP = 0x1505, + XOR = 0x1506, + OR = 0x1507, + NOR = 0x1508, + EQUIV = 0x1509, + INVERT = 0x150A, + OR_REVERSE = 0x150B, + COPY_INVERTED = 0x150C, + OR_INVERTED = 0x150D, + NAND = 0x150E, + SET = 0x150F, + TEXTURE = 0x1702, + COLOR = 0x1800, + DEPTH = 0x1801, + STENCIL = 0x1802, + STENCIL_INDEX = 0x1901, + DEPTH_COMPONENT = 0x1902, + RED = 0x1903, + GREEN = 0x1904, + BLUE = 0x1905, + ALPHA = 0x1906, + RGB = 0x1907, + RGBA = 0x1908, + POINT = 0x1B00, + LINE = 0x1B01, + FILL = 0x1B02, + KEEP = 0x1E00, + REPLACE = 0x1E01, + INCR = 0x1E02, + DECR = 0x1E03, + VENDOR = 0x1F00, + RENDERER = 0x1F01, + VERSION_ = 0x1F02, + EXTENSIONS = 0x1F03, + NEAREST = 0x2600, + LINEAR = 0x2601, + NEAREST_MIPMAP_NEAREST = 0x2700, + LINEAR_MIPMAP_NEAREST = 0x2701, + NEAREST_MIPMAP_LINEAR = 0x2702, + LINEAR_MIPMAP_LINEAR = 0x2703, + TEXTURE_MAG_FILTER = 0x2800, + TEXTURE_MIN_FILTER = 0x2801, + TEXTURE_WRAP_S = 0x2802, + TEXTURE_WRAP_T = 0x2803, + PROXY_TEXTURE_1D = 0x8063, + PROXY_TEXTURE_2D = 0x8064, + REPEAT = 0x2901, + R3_G3_B2 = 0x2A10, + RGB4 = 0x804F, + RGB5 = 0x8050, + RGB8 = 0x8051, + RGB10 = 0x8052, + RGB12 = 0x8053, + RGB16 = 0x8054, + RGBA2 = 0x8055, + RGBA4 = 0x8056, + RGB5_A1 = 0x8057, + RGBA8 = 0x8058, + RGB10_A2 = 0x8059, + RGBA12 = 0x805A, + RGBA16 = 0x805B, + + // Core Extension: ARB_imaging + CONSTANT_COLOR = 0x8001, + ONE_MINUS_CONSTANT_COLOR = 0x8002, + CONSTANT_ALPHA = 0x8003, + ONE_MINUS_CONSTANT_ALPHA = 0x8004, + BLEND_COLOR = 0x8005, + FUNC_ADD = 0x8006, + MIN = 0x8007, + MAX = 0x8008, + BLEND_EQUATION = 0x8009, + FUNC_SUBTRACT = 0x800A, + FUNC_REVERSE_SUBTRACT = 0x800B, + CONVOLUTION_1D = 0x8010, + CONVOLUTION_2D = 0x8011, + SEPARABLE_2D = 0x8012, + CONVOLUTION_BORDER_MODE = 0x8013, + CONVOLUTION_FILTER_SCALE = 0x8014, + CONVOLUTION_FILTER_BIAS = 0x8015, + REDUCE = 0x8016, + CONVOLUTION_FORMAT = 0x8017, + CONVOLUTION_WIDTH = 0x8018, + CONVOLUTION_HEIGHT = 0x8019, + MAX_CONVOLUTION_WIDTH = 0x801A, + MAX_CONVOLUTION_HEIGHT = 0x801B, + POST_CONVOLUTION_RED_SCALE = 0x801C, + POST_CONVOLUTION_GREEN_SCALE = 0x801D, + POST_CONVOLUTION_BLUE_SCALE = 0x801E, + POST_CONVOLUTION_ALPHA_SCALE = 0x801F, + POST_CONVOLUTION_RED_BIAS = 0x8020, + POST_CONVOLUTION_GREEN_BIAS = 0x8021, + POST_CONVOLUTION_BLUE_BIAS = 0x8022, + POST_CONVOLUTION_ALPHA_BIAS = 0x8023, + HISTOGRAM = 0x8024, + PROXY_HISTOGRAM = 0x8025, + HISTOGRAM_WIDTH = 0x8026, + HISTOGRAM_FORMAT = 0x8027, + HISTOGRAM_RED_SIZE = 0x8028, + HISTOGRAM_GREEN_SIZE = 0x8029, + HISTOGRAM_BLUE_SIZE = 0x802A, + HISTOGRAM_ALPHA_SIZE = 0x802B, + HISTOGRAM_LUMINANCE_SIZE = 0x802C, + HISTOGRAM_SINK = 0x802D, + MINMAX = 0x802E, + MINMAX_FORMAT = 0x802F, + MINMAX_SINK = 0x8030, + TABLE_TOO_LARGE = 0x8031, + COLOR_MATRIX = 0x80B1, + COLOR_MATRIX_STACK_DEPTH = 0x80B2, + MAX_COLOR_MATRIX_STACK_DEPTH = 0x80B3, + POST_COLOR_MATRIX_RED_SCALE = 0x80B4, + POST_COLOR_MATRIX_GREEN_SCALE = 0x80B5, + POST_COLOR_MATRIX_BLUE_SCALE = 0x80B6, + POST_COLOR_MATRIX_ALPHA_SCALE = 0x80B7, + POST_COLOR_MATRIX_RED_BIAS = 0x80B8, + POST_COLOR_MATRIX_GREEN_BIAS = 0x80B9, + POST_COLOR_MATRIX_BLUE_BIAS = 0x80BA, + POST_COLOR_MATRIX_ALPHA_BIAS = 0x80BB, + COLOR_TABLE = 0x80D0, + 
POST_CONVOLUTION_COLOR_TABLE = 0x80D1, + POST_COLOR_MATRIX_COLOR_TABLE = 0x80D2, + PROXY_COLOR_TABLE = 0x80D3, + PROXY_POST_CONVOLUTION_COLOR_TABLE = 0x80D4, + PROXY_POST_COLOR_MATRIX_COLOR_TABLE = 0x80D5, + COLOR_TABLE_SCALE = 0x80D6, + COLOR_TABLE_BIAS = 0x80D7, + COLOR_TABLE_FORMAT = 0x80D8, + COLOR_TABLE_WIDTH = 0x80D9, + COLOR_TABLE_RED_SIZE = 0x80DA, + COLOR_TABLE_GREEN_SIZE = 0x80DB, + COLOR_TABLE_BLUE_SIZE = 0x80DC, + COLOR_TABLE_ALPHA_SIZE = 0x80DD, + COLOR_TABLE_LUMINANCE_SIZE = 0x80DE, + COLOR_TABLE_INTENSITY_SIZE = 0x80DF, + CONSTANT_BORDER = 0x8151, + REPLICATE_BORDER = 0x8153, + CONVOLUTION_BORDER_COLOR = 0x8154, + + // Version: 1.2 + UNSIGNED_BYTE_3_3_2 = 0x8032, + UNSIGNED_SHORT_4_4_4_4 = 0x8033, + UNSIGNED_SHORT_5_5_5_1 = 0x8034, + UNSIGNED_INT_8_8_8_8 = 0x8035, + UNSIGNED_INT_10_10_10_2 = 0x8036, + TEXTURE_BINDING_3D = 0x806A, + PACK_SKIP_IMAGES = 0x806B, + PACK_IMAGE_HEIGHT = 0x806C, + UNPACK_SKIP_IMAGES = 0x806D, + UNPACK_IMAGE_HEIGHT = 0x806E, + TEXTURE_3D = 0x806F, + PROXY_TEXTURE_3D = 0x8070, + TEXTURE_DEPTH = 0x8071, + TEXTURE_WRAP_R = 0x8072, + MAX_3D_TEXTURE_SIZE = 0x8073, + UNSIGNED_BYTE_2_3_3_REV = 0x8362, + UNSIGNED_SHORT_5_6_5 = 0x8363, + UNSIGNED_SHORT_5_6_5_REV = 0x8364, + UNSIGNED_SHORT_4_4_4_4_REV = 0x8365, + UNSIGNED_SHORT_1_5_5_5_REV = 0x8366, + UNSIGNED_INT_8_8_8_8_REV = 0x8367, + UNSIGNED_INT_2_10_10_10_REV = 0x8368, + BGR = 0x80E0, + BGRA = 0x80E1, + MAX_ELEMENTS_VERTICES = 0x80E8, + MAX_ELEMENTS_INDICES = 0x80E9, + CLAMP_TO_EDGE = 0x812F, + TEXTURE_MIN_LOD = 0x813A, + TEXTURE_MAX_LOD = 0x813B, + TEXTURE_BASE_LEVEL = 0x813C, + TEXTURE_MAX_LEVEL = 0x813D, + SMOOTH_POINT_SIZE_RANGE = 0x0B12, + SMOOTH_POINT_SIZE_GRANULARITY = 0x0B13, + SMOOTH_LINE_WIDTH_RANGE = 0x0B22, + SMOOTH_LINE_WIDTH_GRANULARITY = 0x0B23, + ALIASED_LINE_WIDTH_RANGE = 0x846E, + + // Version: 1.3 + TEXTURE0 = 0x84C0, + TEXTURE1 = 0x84C1, + TEXTURE2 = 0x84C2, + TEXTURE3 = 0x84C3, + TEXTURE4 = 0x84C4, + TEXTURE5 = 0x84C5, + TEXTURE6 = 0x84C6, + TEXTURE7 = 0x84C7, + TEXTURE8 = 0x84C8, + TEXTURE9 = 0x84C9, + TEXTURE10 = 0x84CA, + TEXTURE11 = 0x84CB, + TEXTURE12 = 0x84CC, + TEXTURE13 = 0x84CD, + TEXTURE14 = 0x84CE, + TEXTURE15 = 0x84CF, + TEXTURE16 = 0x84D0, + TEXTURE17 = 0x84D1, + TEXTURE18 = 0x84D2, + TEXTURE19 = 0x84D3, + TEXTURE20 = 0x84D4, + TEXTURE21 = 0x84D5, + TEXTURE22 = 0x84D6, + TEXTURE23 = 0x84D7, + TEXTURE24 = 0x84D8, + TEXTURE25 = 0x84D9, + TEXTURE26 = 0x84DA, + TEXTURE27 = 0x84DB, + TEXTURE28 = 0x84DC, + TEXTURE29 = 0x84DD, + TEXTURE30 = 0x84DE, + TEXTURE31 = 0x84DF, + ACTIVE_TEXTURE = 0x84E0, + MULTISAMPLE = 0x809D, + SAMPLE_ALPHA_TO_COVERAGE = 0x809E, + SAMPLE_ALPHA_TO_ONE = 0x809F, + SAMPLE_COVERAGE = 0x80A0, + SAMPLE_BUFFERS = 0x80A8, + SAMPLES = 0x80A9, + SAMPLE_COVERAGE_VALUE = 0x80AA, + SAMPLE_COVERAGE_INVERT = 0x80AB, + TEXTURE_CUBE_MAP = 0x8513, + TEXTURE_BINDING_CUBE_MAP = 0x8514, + TEXTURE_CUBE_MAP_POSITIVE_X = 0x8515, + TEXTURE_CUBE_MAP_NEGATIVE_X = 0x8516, + TEXTURE_CUBE_MAP_POSITIVE_Y = 0x8517, + TEXTURE_CUBE_MAP_NEGATIVE_Y = 0x8518, + TEXTURE_CUBE_MAP_POSITIVE_Z = 0x8519, + TEXTURE_CUBE_MAP_NEGATIVE_Z = 0x851A, + PROXY_TEXTURE_CUBE_MAP = 0x851B, + MAX_CUBE_MAP_TEXTURE_SIZE = 0x851C, + COMPRESSED_RGB = 0x84ED, + COMPRESSED_RGBA = 0x84EE, + TEXTURE_COMPRESSION_HINT = 0x84EF, + TEXTURE_COMPRESSED_IMAGE_SIZE = 0x86A0, + TEXTURE_COMPRESSED = 0x86A1, + NUM_COMPRESSED_TEXTURE_FORMATS = 0x86A2, + COMPRESSED_TEXTURE_FORMATS = 0x86A3, + CLAMP_TO_BORDER = 0x812D, + + // Version: 1.4 + BLEND_DST_RGB = 0x80C8, + BLEND_SRC_RGB = 0x80C9, + BLEND_DST_ALPHA = 0x80CA, + 
BLEND_SRC_ALPHA = 0x80CB, + POINT_FADE_THRESHOLD_SIZE = 0x8128, + DEPTH_COMPONENT16 = 0x81A5, + DEPTH_COMPONENT24 = 0x81A6, + DEPTH_COMPONENT32 = 0x81A7, + MIRRORED_REPEAT = 0x8370, + MAX_TEXTURE_LOD_BIAS = 0x84FD, + TEXTURE_LOD_BIAS = 0x8501, + INCR_WRAP = 0x8507, + DECR_WRAP = 0x8508, + TEXTURE_DEPTH_SIZE = 0x884A, + TEXTURE_COMPARE_MODE = 0x884C, + TEXTURE_COMPARE_FUNC = 0x884D, + + // Version: 1.5 + BUFFER_SIZE = 0x8764, + BUFFER_USAGE = 0x8765, + QUERY_COUNTER_BITS = 0x8864, + CURRENT_QUERY = 0x8865, + QUERY_RESULT = 0x8866, + QUERY_RESULT_AVAILABLE = 0x8867, + ARRAY_BUFFER = 0x8892, + ELEMENT_ARRAY_BUFFER = 0x8893, + ARRAY_BUFFER_BINDING = 0x8894, + ELEMENT_ARRAY_BUFFER_BINDING = 0x8895, + VERTEX_ATTRIB_ARRAY_BUFFER_BINDING = 0x889F, + READ_ONLY = 0x88B8, + WRITE_ONLY = 0x88B9, + READ_WRITE = 0x88BA, + BUFFER_ACCESS = 0x88BB, + BUFFER_MAPPED = 0x88BC, + BUFFER_MAP_POINTER = 0x88BD, + STREAM_DRAW = 0x88E0, + STREAM_READ = 0x88E1, + STREAM_COPY = 0x88E2, + STATIC_DRAW = 0x88E4, + STATIC_READ = 0x88E5, + STATIC_COPY = 0x88E6, + DYNAMIC_DRAW = 0x88E8, + DYNAMIC_READ = 0x88E9, + DYNAMIC_COPY = 0x88EA, + SAMPLES_PASSED = 0x8914, + SRC1_ALPHA = 0x8589, + + // Version: 2.0 + BLEND_EQUATION_RGB = 0x8009, + VERTEX_ATTRIB_ARRAY_ENABLED = 0x8622, + VERTEX_ATTRIB_ARRAY_SIZE = 0x8623, + VERTEX_ATTRIB_ARRAY_STRIDE = 0x8624, + VERTEX_ATTRIB_ARRAY_TYPE = 0x8625, + CURRENT_VERTEX_ATTRIB = 0x8626, + VERTEX_PROGRAM_POINT_SIZE = 0x8642, + VERTEX_ATTRIB_ARRAY_POINTER = 0x8645, + STENCIL_BACK_FUNC = 0x8800, + STENCIL_BACK_FAIL = 0x8801, + STENCIL_BACK_PASS_DEPTH_FAIL = 0x8802, + STENCIL_BACK_PASS_DEPTH_PASS = 0x8803, + MAX_DRAW_BUFFERS = 0x8824, + DRAW_BUFFER0 = 0x8825, + DRAW_BUFFER1 = 0x8826, + DRAW_BUFFER2 = 0x8827, + DRAW_BUFFER3 = 0x8828, + DRAW_BUFFER4 = 0x8829, + DRAW_BUFFER5 = 0x882A, + DRAW_BUFFER6 = 0x882B, + DRAW_BUFFER7 = 0x882C, + DRAW_BUFFER8 = 0x882D, + DRAW_BUFFER9 = 0x882E, + DRAW_BUFFER10 = 0x882F, + DRAW_BUFFER11 = 0x8830, + DRAW_BUFFER12 = 0x8831, + DRAW_BUFFER13 = 0x8832, + DRAW_BUFFER14 = 0x8833, + DRAW_BUFFER15 = 0x8834, + BLEND_EQUATION_ALPHA = 0x883D, + MAX_VERTEX_ATTRIBS = 0x8869, + VERTEX_ATTRIB_ARRAY_NORMALIZED = 0x886A, + MAX_TEXTURE_IMAGE_UNITS = 0x8872, + FRAGMENT_SHADER = 0x8B30, + VERTEX_SHADER = 0x8B31, + MAX_FRAGMENT_UNIFORM_COMPONENTS = 0x8B49, + MAX_VERTEX_UNIFORM_COMPONENTS = 0x8B4A, + MAX_VARYING_FLOATS = 0x8B4B, + MAX_VERTEX_TEXTURE_IMAGE_UNITS = 0x8B4C, + MAX_COMBINED_TEXTURE_IMAGE_UNITS = 0x8B4D, + SHADER_TYPE = 0x8B4F, + FLOAT_VEC2 = 0x8B50, + FLOAT_VEC3 = 0x8B51, + FLOAT_VEC4 = 0x8B52, + INT_VEC2 = 0x8B53, + INT_VEC3 = 0x8B54, + INT_VEC4 = 0x8B55, + BOOL = 0x8B56, + BOOL_VEC2 = 0x8B57, + BOOL_VEC3 = 0x8B58, + BOOL_VEC4 = 0x8B59, + FLOAT_MAT2 = 0x8B5A, + FLOAT_MAT3 = 0x8B5B, + FLOAT_MAT4 = 0x8B5C, + SAMPLER_1D = 0x8B5D, + SAMPLER_2D = 0x8B5E, + SAMPLER_3D = 0x8B5F, + SAMPLER_CUBE = 0x8B60, + SAMPLER_1D_SHADOW = 0x8B61, + SAMPLER_2D_SHADOW = 0x8B62, + DELETE_STATUS = 0x8B80, + COMPILE_STATUS = 0x8B81, + LINK_STATUS = 0x8B82, + VALIDATE_STATUS = 0x8B83, + INFO_LOG_LENGTH = 0x8B84, + ATTACHED_SHADERS = 0x8B85, + ACTIVE_UNIFORMS = 0x8B86, + ACTIVE_UNIFORM_MAX_LENGTH = 0x8B87, + SHADER_SOURCE_LENGTH = 0x8B88, + ACTIVE_ATTRIBUTES = 0x8B89, + ACTIVE_ATTRIBUTE_MAX_LENGTH = 0x8B8A, + FRAGMENT_SHADER_DERIVATIVE_HINT = 0x8B8B, + SHADING_LANGUAGE_VERSION = 0x8B8C, + CURRENT_PROGRAM = 0x8B8D, + POINT_SPRITE_COORD_ORIGIN = 0x8CA0, + LOWER_LEFT = 0x8CA1, + UPPER_LEFT = 0x8CA2, + STENCIL_BACK_REF = 0x8CA3, + STENCIL_BACK_VALUE_MASK = 0x8CA4, + STENCIL_BACK_WRITEMASK = 0x8CA5, + + 
// Version: 2.1 + PIXEL_PACK_BUFFER = 0x88EB, + PIXEL_UNPACK_BUFFER = 0x88EC, + PIXEL_PACK_BUFFER_BINDING = 0x88ED, + PIXEL_UNPACK_BUFFER_BINDING = 0x88EF, + FLOAT_MAT2x3 = 0x8B65, + FLOAT_MAT2x4 = 0x8B66, + FLOAT_MAT3x2 = 0x8B67, + FLOAT_MAT3x4 = 0x8B68, + FLOAT_MAT4x2 = 0x8B69, + FLOAT_MAT4x3 = 0x8B6A, + SRGB = 0x8C40, + SRGB8 = 0x8C41, + SRGB_ALPHA = 0x8C42, + SRGB8_ALPHA8 = 0x8C43, + COMPRESSED_SRGB = 0x8C48, + COMPRESSED_SRGB_ALPHA = 0x8C49, + + // Core Extension: ARB_vertex_array_object + VERTEX_ARRAY_BINDING = 0x85B5, + + // Core Extension: ARB_texture_rg + RG = 0x8227, + RG_INTEGER = 0x8228, + R8 = 0x8229, + R16 = 0x822A, + RG8 = 0x822B, + RG16 = 0x822C, + R16F = 0x822D, + R32F = 0x822E, + RG16F = 0x822F, + RG32F = 0x8230, + R8I = 0x8231, + R8UI = 0x8232, + R16I = 0x8233, + R16UI = 0x8234, + R32I = 0x8235, + R32UI = 0x8236, + RG8I = 0x8237, + RG8UI = 0x8238, + RG16I = 0x8239, + RG16UI = 0x823A, + RG32I = 0x823B, + RG32UI = 0x823C, + + // Core Extension: ARB_texture_compression_rgtc + COMPRESSED_RED_RGTC1 = 0x8DBB, + COMPRESSED_SIGNED_RED_RGTC1 = 0x8DBC, + COMPRESSED_RG_RGTC2 = 0x8DBD, + COMPRESSED_SIGNED_RG_RGTC2 = 0x8DBE, + + // Core Extension: ARB_map_buffer_range + MAP_READ_BIT = 0x0001, + MAP_WRITE_BIT = 0x0002, + MAP_INVALIDATE_RANGE_BIT = 0x0004, + MAP_INVALIDATE_BUFFER_BIT = 0x0008, + MAP_FLUSH_EXPLICIT_BIT = 0x0010, + MAP_UNSYNCHRONIZED_BIT = 0x0020, + + // Core Extension: ARB_half_float_vertex + HALF_FLOAT = 0x140B, + + // Core Extension: ARB_framebuffer_sRGB + FRAMEBUFFER_SRGB = 0x8DB9, + + // Core Extension: ARB_framebuffer_object + INVALID_FRAMEBUFFER_OPERATION = 0x0506, + FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING = 0x8210, + FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE = 0x8211, + FRAMEBUFFER_ATTACHMENT_RED_SIZE = 0x8212, + FRAMEBUFFER_ATTACHMENT_GREEN_SIZE = 0x8213, + FRAMEBUFFER_ATTACHMENT_BLUE_SIZE = 0x8214, + FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE = 0x8215, + FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE = 0x8216, + FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE = 0x8217, + FRAMEBUFFER_DEFAULT = 0x8218, + FRAMEBUFFER_UNDEFINED = 0x8219, + DEPTH_STENCIL_ATTACHMENT = 0x821A, + INDEX = 0x8222, + MAX_RENDERBUFFER_SIZE = 0x84E8, + DEPTH_STENCIL = 0x84F9, + UNSIGNED_INT_24_8 = 0x84FA, + DEPTH24_STENCIL8 = 0x88F0, + TEXTURE_STENCIL_SIZE = 0x88F1, + TEXTURE_RED_TYPE = 0x8C10, + TEXTURE_GREEN_TYPE = 0x8C11, + TEXTURE_BLUE_TYPE = 0x8C12, + TEXTURE_ALPHA_TYPE = 0x8C13, + TEXTURE_DEPTH_TYPE = 0x8C16, + UNSIGNED_NORMALIZED = 0x8C17, + FRAMEBUFFER_BINDING = 0x8CA6, + DRAW_FRAMEBUFFER_BINDING = 0x8CA6, + RENDERBUFFER_BINDING = 0x8CA7, + READ_FRAMEBUFFER = 0x8CA8, + DRAW_FRAMEBUFFER = 0x8CA9, + READ_FRAMEBUFFER_BINDING = 0x8CAA, + RENDERBUFFER_SAMPLES = 0x8CAB, + FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE = 0x8CD0, + FRAMEBUFFER_ATTACHMENT_OBJECT_NAME = 0x8CD1, + FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL = 0x8CD2, + FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE = 0x8CD3, + FRAMEBUFFER_ATTACHMENT_TEXTURE_LAYER = 0x8CD4, + FRAMEBUFFER_COMPLETE = 0x8CD5, + FRAMEBUFFER_INCOMPLETE_ATTACHMENT = 0x8CD6, + FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT = 0x8CD7, + FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER = 0x8CDB, + FRAMEBUFFER_INCOMPLETE_READ_BUFFER = 0x8CDC, + FRAMEBUFFER_UNSUPPORTED = 0x8CDD, + MAX_COLOR_ATTACHMENTS = 0x8CDF, + COLOR_ATTACHMENT0 = 0x8CE0, + COLOR_ATTACHMENT1 = 0x8CE1, + COLOR_ATTACHMENT2 = 0x8CE2, + COLOR_ATTACHMENT3 = 0x8CE3, + COLOR_ATTACHMENT4 = 0x8CE4, + COLOR_ATTACHMENT5 = 0x8CE5, + COLOR_ATTACHMENT6 = 0x8CE6, + COLOR_ATTACHMENT7 = 0x8CE7, + COLOR_ATTACHMENT8 = 0x8CE8, + COLOR_ATTACHMENT9 = 0x8CE9, + COLOR_ATTACHMENT10 = 
0x8CEA, + COLOR_ATTACHMENT11 = 0x8CEB, + COLOR_ATTACHMENT12 = 0x8CEC, + COLOR_ATTACHMENT13 = 0x8CED, + COLOR_ATTACHMENT14 = 0x8CEE, + COLOR_ATTACHMENT15 = 0x8CEF, + DEPTH_ATTACHMENT = 0x8D00, + STENCIL_ATTACHMENT = 0x8D20, + FRAMEBUFFER = 0x8D40, + RENDERBUFFER = 0x8D41, + RENDERBUFFER_WIDTH = 0x8D42, + RENDERBUFFER_HEIGHT = 0x8D43, + RENDERBUFFER_INTERNAL_FORMAT = 0x8D44, + STENCIL_INDEX1 = 0x8D46, + STENCIL_INDEX4 = 0x8D47, + STENCIL_INDEX8 = 0x8D48, + STENCIL_INDEX16 = 0x8D49, + RENDERBUFFER_RED_SIZE = 0x8D50, + RENDERBUFFER_GREEN_SIZE = 0x8D51, + RENDERBUFFER_BLUE_SIZE = 0x8D52, + RENDERBUFFER_ALPHA_SIZE = 0x8D53, + RENDERBUFFER_DEPTH_SIZE = 0x8D54, + RENDERBUFFER_STENCIL_SIZE = 0x8D55, + FRAMEBUFFER_INCOMPLETE_MULTISAMPLE = 0x8D56, + MAX_SAMPLES = 0x8D57, + TEXTURE_LUMINANCE_TYPE = 0x8C14, + TEXTURE_INTENSITY_TYPE = 0x8C15, + + // Core Extension: ARB_depth_buffer_float + DEPTH_COMPONENT32F = 0x8CAC, + DEPTH32F_STENCIL8 = 0x8CAD, + FLOAT_32_UNSIGNED_INT_24_8_REV = 0x8DAD, + + // Version: 3.0 + COMPARE_REF_TO_TEXTURE = 0x884E, + CLIP_DISTANCE0 = 0x3000, + CLIP_DISTANCE1 = 0x3001, + CLIP_DISTANCE2 = 0x3002, + CLIP_DISTANCE3 = 0x3003, + CLIP_DISTANCE4 = 0x3004, + CLIP_DISTANCE5 = 0x3005, + CLIP_DISTANCE6 = 0x3006, + CLIP_DISTANCE7 = 0x3007, + MAX_CLIP_DISTANCES = 0x0D32, + MAJOR_VERSION = 0x821B, + MINOR_VERSION = 0x821C, + NUM_EXTENSIONS = 0x821D, + CONTEXT_FLAGS = 0x821E, + COMPRESSED_RED = 0x8225, + COMPRESSED_RG = 0x8226, + CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT = 0x0001, + RGBA32F = 0x8814, + RGB32F = 0x8815, + RGBA16F = 0x881A, + RGB16F = 0x881B, + VERTEX_ATTRIB_ARRAY_INTEGER = 0x88FD, + MAX_ARRAY_TEXTURE_LAYERS = 0x88FF, + MIN_PROGRAM_TEXEL_OFFSET = 0x8904, + MAX_PROGRAM_TEXEL_OFFSET = 0x8905, + CLAMP_READ_COLOR = 0x891C, + FIXED_ONLY = 0x891D, + TEXTURE_1D_ARRAY = 0x8C18, + PROXY_TEXTURE_1D_ARRAY = 0x8C19, + TEXTURE_2D_ARRAY = 0x8C1A, + PROXY_TEXTURE_2D_ARRAY = 0x8C1B, + TEXTURE_BINDING_1D_ARRAY = 0x8C1C, + TEXTURE_BINDING_2D_ARRAY = 0x8C1D, + R11F_G11F_B10F = 0x8C3A, + UNSIGNED_INT_10F_11F_11F_REV = 0x8C3B, + RGB9_E5 = 0x8C3D, + UNSIGNED_INT_5_9_9_9_REV = 0x8C3E, + TEXTURE_SHARED_SIZE = 0x8C3F, + TRANSFORM_FEEDBACK_VARYING_MAX_LENGTH = 0x8C76, + TRANSFORM_FEEDBACK_BUFFER_MODE = 0x8C7F, + MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 0x8C80, + TRANSFORM_FEEDBACK_VARYINGS = 0x8C83, + TRANSFORM_FEEDBACK_BUFFER_START = 0x8C84, + TRANSFORM_FEEDBACK_BUFFER_SIZE = 0x8C85, + PRIMITIVES_GENERATED = 0x8C87, + TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN = 0x8C88, + RASTERIZER_DISCARD = 0x8C89, + MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 0x8C8A, + MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS = 0x8C8B, + INTERLEAVED_ATTRIBS = 0x8C8C, + SEPARATE_ATTRIBS = 0x8C8D, + TRANSFORM_FEEDBACK_BUFFER = 0x8C8E, + TRANSFORM_FEEDBACK_BUFFER_BINDING = 0x8C8F, + RGBA32UI = 0x8D70, + RGB32UI = 0x8D71, + RGBA16UI = 0x8D76, + RGB16UI = 0x8D77, + RGBA8UI = 0x8D7C, + RGB8UI = 0x8D7D, + RGBA32I = 0x8D82, + RGB32I = 0x8D83, + RGBA16I = 0x8D88, + RGB16I = 0x8D89, + RGBA8I = 0x8D8E, + RGB8I = 0x8D8F, + RED_INTEGER = 0x8D94, + GREEN_INTEGER = 0x8D95, + BLUE_INTEGER = 0x8D96, + RGB_INTEGER = 0x8D98, + RGBA_INTEGER = 0x8D99, + BGR_INTEGER = 0x8D9A, + BGRA_INTEGER = 0x8D9B, + SAMPLER_1D_ARRAY = 0x8DC0, + SAMPLER_2D_ARRAY = 0x8DC1, + SAMPLER_1D_ARRAY_SHADOW = 0x8DC3, + SAMPLER_2D_ARRAY_SHADOW = 0x8DC4, + SAMPLER_CUBE_SHADOW = 0x8DC5, + UNSIGNED_INT_VEC2 = 0x8DC6, + UNSIGNED_INT_VEC3 = 0x8DC7, + UNSIGNED_INT_VEC4 = 0x8DC8, + INT_SAMPLER_1D = 0x8DC9, + INT_SAMPLER_2D = 0x8DCA, + INT_SAMPLER_3D = 0x8DCB, + INT_SAMPLER_CUBE = 0x8DCC, 
+ INT_SAMPLER_1D_ARRAY = 0x8DCE, + INT_SAMPLER_2D_ARRAY = 0x8DCF, + UNSIGNED_INT_SAMPLER_1D = 0x8DD1, + UNSIGNED_INT_SAMPLER_2D = 0x8DD2, + UNSIGNED_INT_SAMPLER_3D = 0x8DD3, + UNSIGNED_INT_SAMPLER_CUBE = 0x8DD4, + UNSIGNED_INT_SAMPLER_1D_ARRAY = 0x8DD6, + UNSIGNED_INT_SAMPLER_2D_ARRAY = 0x8DD7, + QUERY_WAIT = 0x8E13, + QUERY_NO_WAIT = 0x8E14, + QUERY_BY_REGION_WAIT = 0x8E15, + QUERY_BY_REGION_NO_WAIT = 0x8E16, + BUFFER_ACCESS_FLAGS = 0x911F, + BUFFER_MAP_LENGTH = 0x9120, + BUFFER_MAP_OFFSET = 0x9121, + + // Core Extension: ARB_uniform_buffer_object + UNIFORM_BUFFER = 0x8A11, + UNIFORM_BUFFER_BINDING = 0x8A28, + UNIFORM_BUFFER_START = 0x8A29, + UNIFORM_BUFFER_SIZE = 0x8A2A, + MAX_VERTEX_UNIFORM_BLOCKS = 0x8A2B, + MAX_FRAGMENT_UNIFORM_BLOCKS = 0x8A2D, + MAX_COMBINED_UNIFORM_BLOCKS = 0x8A2E, + MAX_UNIFORM_BUFFER_BINDINGS = 0x8A2F, + MAX_UNIFORM_BLOCK_SIZE = 0x8A30, + MAX_COMBINED_VERTEX_UNIFORM_COMPONENTS = 0x8A31, + MAX_COMBINED_FRAGMENT_UNIFORM_COMPONENTS = 0x8A33, + UNIFORM_BUFFER_OFFSET_ALIGNMENT = 0x8A34, + ACTIVE_UNIFORM_BLOCK_MAX_NAME_LENGTH = 0x8A35, + ACTIVE_UNIFORM_BLOCKS = 0x8A36, + UNIFORM_TYPE = 0x8A37, + UNIFORM_SIZE = 0x8A38, + UNIFORM_NAME_LENGTH = 0x8A39, + UNIFORM_BLOCK_INDEX = 0x8A3A, + UNIFORM_OFFSET = 0x8A3B, + UNIFORM_ARRAY_STRIDE = 0x8A3C, + UNIFORM_MATRIX_STRIDE = 0x8A3D, + UNIFORM_IS_ROW_MAJOR = 0x8A3E, + UNIFORM_BLOCK_BINDING = 0x8A3F, + UNIFORM_BLOCK_DATA_SIZE = 0x8A40, + UNIFORM_BLOCK_NAME_LENGTH = 0x8A41, + UNIFORM_BLOCK_ACTIVE_UNIFORMS = 0x8A42, + UNIFORM_BLOCK_ACTIVE_UNIFORM_INDICES = 0x8A43, + UNIFORM_BLOCK_REFERENCED_BY_VERTEX_SHADER = 0x8A44, + UNIFORM_BLOCK_REFERENCED_BY_FRAGMENT_SHADER = 0x8A46, + INVALID_INDEX = 0xFFFFFFFF, + MAX_GEOMETRY_UNIFORM_BLOCKS = 0x8A2C, + MAX_COMBINED_GEOMETRY_UNIFORM_COMPONENTS = 0x8A32, + UNIFORM_BLOCK_REFERENCED_BY_GEOMETRY_SHADER = 0x8A45, + + // Core Extension: ARB_copy_buffer + COPY_READ_BUFFER = 0x8F36, + COPY_WRITE_BUFFER = 0x8F37, + COPY_READ_BUFFER_BINDING = 0x8F36, + COPY_WRITE_BUFFER_BINDING = 0x8F37, + + // Version: 3.1 + SAMPLER_2D_RECT = 0x8B63, + SAMPLER_2D_RECT_SHADOW = 0x8B64, + SAMPLER_BUFFER = 0x8DC2, + INT_SAMPLER_2D_RECT = 0x8DCD, + INT_SAMPLER_BUFFER = 0x8DD0, + UNSIGNED_INT_SAMPLER_2D_RECT = 0x8DD5, + UNSIGNED_INT_SAMPLER_BUFFER = 0x8DD8, + TEXTURE_BUFFER = 0x8C2A, + MAX_TEXTURE_BUFFER_SIZE = 0x8C2B, + TEXTURE_BINDING_BUFFER = 0x8C2C, + TEXTURE_BUFFER_DATA_STORE_BINDING = 0x8C2D, + TEXTURE_RECTANGLE = 0x84F5, + TEXTURE_BINDING_RECTANGLE = 0x84F6, + PROXY_TEXTURE_RECTANGLE = 0x84F7, + MAX_RECTANGLE_TEXTURE_SIZE = 0x84F8, + RED_SNORM = 0x8F90, + RG_SNORM = 0x8F91, + RGB_SNORM = 0x8F92, + RGBA_SNORM = 0x8F93, + R8_SNORM = 0x8F94, + RG8_SNORM = 0x8F95, + RGB8_SNORM = 0x8F96, + RGBA8_SNORM = 0x8F97, + R16_SNORM = 0x8F98, + RG16_SNORM = 0x8F99, + RGB16_SNORM = 0x8F9A, + RGBA16_SNORM = 0x8F9B, + SIGNED_NORMALIZED = 0x8F9C, + PRIMITIVE_RESTART = 0x8F9D, + PRIMITIVE_RESTART_INDEX = 0x8F9E, + + // Legacy + VERTEX_ARRAY = 0x8074, + NORMAL_ARRAY = 0x8075, + COLOR_ARRAY = 0x8076, + TEXTURE_COORD_ARRAY = 0x8078, + TEXTURE_ENV = 0x2300, + TEXTURE_ENV_MODE = 0x2200, + MODELVIEW = 0x1700, + PROJECTION = 0x1701, + LIGHTING = 0x0B50 + }; + + // Extension: 1.1 + extern void (CODEGEN_FUNCPTR *CullFace)(GLenum mode); + extern void (CODEGEN_FUNCPTR *FrontFace)(GLenum mode); + extern void (CODEGEN_FUNCPTR *Hint)(GLenum target, GLenum mode); + extern void (CODEGEN_FUNCPTR *LineWidth)(GLfloat width); + extern void (CODEGEN_FUNCPTR *PointSize)(GLfloat size); + extern void (CODEGEN_FUNCPTR *PolygonMode)(GLenum face, GLenum mode); + 
extern void (CODEGEN_FUNCPTR *Scissor)(GLint x, GLint y, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *TexParameterf)(GLenum target, GLenum pname, GLfloat param); + extern void (CODEGEN_FUNCPTR *TexParameterfv)(GLenum target, GLenum pname, const GLfloat *params); + extern void (CODEGEN_FUNCPTR *TexParameteri)(GLenum target, GLenum pname, GLint param); + extern void (CODEGEN_FUNCPTR *TexParameteriv)(GLenum target, GLenum pname, const GLint *params); + extern void (CODEGEN_FUNCPTR *TexImage1D)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *TexImage2D)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *DrawBuffer)(GLenum mode); + extern void (CODEGEN_FUNCPTR *Clear)(GLbitfield mask); + extern void (CODEGEN_FUNCPTR *ClearColor)(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha); + extern void (CODEGEN_FUNCPTR *ClearStencil)(GLint s); + extern void (CODEGEN_FUNCPTR *ClearDepth)(GLdouble depth); + extern void (CODEGEN_FUNCPTR *StencilMask)(GLuint mask); + extern void (CODEGEN_FUNCPTR *ColorMask)(GLboolean red, GLboolean green, GLboolean blue, GLboolean alpha); + extern void (CODEGEN_FUNCPTR *DepthMask)(GLboolean flag); + extern void (CODEGEN_FUNCPTR *Disable)(GLenum cap); + extern void (CODEGEN_FUNCPTR *Enable)(GLenum cap); + extern void (CODEGEN_FUNCPTR *Finish)(); + extern void (CODEGEN_FUNCPTR *Flush)(); + extern void (CODEGEN_FUNCPTR *BlendFunc)(GLenum sfactor, GLenum dfactor); + extern void (CODEGEN_FUNCPTR *LogicOp)(GLenum opcode); + extern void (CODEGEN_FUNCPTR *StencilFunc)(GLenum func, GLint ref, GLuint mask); + extern void (CODEGEN_FUNCPTR *StencilOp)(GLenum fail, GLenum zfail, GLenum zpass); + extern void (CODEGEN_FUNCPTR *DepthFunc)(GLenum func); + extern void (CODEGEN_FUNCPTR *PixelStoref)(GLenum pname, GLfloat param); + extern void (CODEGEN_FUNCPTR *PixelStorei)(GLenum pname, GLint param); + extern void (CODEGEN_FUNCPTR *ReadBuffer)(GLenum mode); + extern void (CODEGEN_FUNCPTR *ReadPixels)(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *GetBooleanv)(GLenum pname, GLboolean *params); + extern void (CODEGEN_FUNCPTR *GetDoublev)(GLenum pname, GLdouble *params); + extern GLenum (CODEGEN_FUNCPTR *GetError)(); + extern void (CODEGEN_FUNCPTR *GetFloatv)(GLenum pname, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetIntegerv)(GLenum pname, GLint *params); + extern const GLubyte * (CODEGEN_FUNCPTR *GetString)(GLenum name); + extern void (CODEGEN_FUNCPTR *GetTexImage)(GLenum target, GLint level, GLenum format, GLenum type, GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *GetTexParameterfv)(GLenum target, GLenum pname, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetTexParameteriv)(GLenum target, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetTexLevelParameterfv)(GLenum target, GLint level, GLenum pname, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetTexLevelParameteriv)(GLenum target, GLint level, GLenum pname, GLint *params); + extern GLboolean (CODEGEN_FUNCPTR *IsEnabled)(GLenum cap); + extern void (CODEGEN_FUNCPTR *DepthRange)(GLdouble ren_near, GLdouble ren_far); + extern void (CODEGEN_FUNCPTR *Viewport)(GLint x, GLint y, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *DrawArrays)(GLenum mode, GLint 
first, GLsizei count); + extern void (CODEGEN_FUNCPTR *DrawElements)(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices); + extern void (CODEGEN_FUNCPTR *GetPointerv)(GLenum pname, GLvoid* *params); + extern void (CODEGEN_FUNCPTR *PolygonOffset)(GLfloat factor, GLfloat units); + extern void (CODEGEN_FUNCPTR *CopyTexImage1D)(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLint border); + extern void (CODEGEN_FUNCPTR *CopyTexImage2D)(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border); + extern void (CODEGEN_FUNCPTR *CopyTexSubImage1D)(GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width); + extern void (CODEGEN_FUNCPTR *CopyTexSubImage2D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *TexSubImage1D)(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *TexSubImage2D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *BindTexture)(GLenum target, GLuint texture); + extern void (CODEGEN_FUNCPTR *DeleteTextures)(GLsizei n, const GLuint *textures); + extern void (CODEGEN_FUNCPTR *GenTextures)(GLsizei n, GLuint *textures); + extern GLboolean (CODEGEN_FUNCPTR *IsTexture)(GLuint texture); + extern void (CODEGEN_FUNCPTR *Indexub)(GLubyte c); + extern void (CODEGEN_FUNCPTR *Indexubv)(const GLubyte *c); + + // Extension: 1.2 + extern void (CODEGEN_FUNCPTR *BlendColor)(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha); + extern void (CODEGEN_FUNCPTR *BlendEquation)(GLenum mode); + extern void (CODEGEN_FUNCPTR *DrawRangeElements)(GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid *indices); + extern void (CODEGEN_FUNCPTR *TexSubImage3D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid *pixels); + extern void (CODEGEN_FUNCPTR *CopyTexSubImage3D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height); + + // Extension: 1.3 + extern void (CODEGEN_FUNCPTR *ActiveTexture)(GLenum texture); + extern void (CODEGEN_FUNCPTR *SampleCoverage)(GLfloat value, GLboolean invert); + extern void (CODEGEN_FUNCPTR *CompressedTexImage3D)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *CompressedTexImage2D)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *CompressedTexImage1D)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *CompressedTexSubImage3D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *CompressedTexSubImage2D)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const GLvoid *data); + extern void 
(CODEGEN_FUNCPTR *CompressedTexSubImage1D)(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *GetCompressedTexImage)(GLenum target, GLint level, GLvoid *img); + + // Extension: 1.4 + extern void (CODEGEN_FUNCPTR *BlendFuncSeparate)(GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha); + extern void (CODEGEN_FUNCPTR *MultiDrawArrays)(GLenum mode, const GLint *first, const GLsizei *count, GLsizei drawcount); + extern void (CODEGEN_FUNCPTR *MultiDrawElements)(GLenum mode, const GLsizei *count, GLenum type, const GLvoid* const *indices, GLsizei drawcount); + extern void (CODEGEN_FUNCPTR *PointParameterf)(GLenum pname, GLfloat param); + extern void (CODEGEN_FUNCPTR *PointParameterfv)(GLenum pname, const GLfloat *params); + extern void (CODEGEN_FUNCPTR *PointParameteri)(GLenum pname, GLint param); + extern void (CODEGEN_FUNCPTR *PointParameteriv)(GLenum pname, const GLint *params); + + // Extension: 1.5 + extern void (CODEGEN_FUNCPTR *GenQueries)(GLsizei n, GLuint *ids); + extern void (CODEGEN_FUNCPTR *DeleteQueries)(GLsizei n, const GLuint *ids); + extern GLboolean (CODEGEN_FUNCPTR *IsQuery)(GLuint id); + extern void (CODEGEN_FUNCPTR *BeginQuery)(GLenum target, GLuint id); + extern void (CODEGEN_FUNCPTR *EndQuery)(GLenum target); + extern void (CODEGEN_FUNCPTR *GetQueryiv)(GLenum target, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetQueryObjectiv)(GLuint id, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetQueryObjectuiv)(GLuint id, GLenum pname, GLuint *params); + extern void (CODEGEN_FUNCPTR *BindBuffer)(GLenum target, GLuint buffer); + extern void (CODEGEN_FUNCPTR *DeleteBuffers)(GLsizei n, const GLuint *buffers); + extern void (CODEGEN_FUNCPTR *GenBuffers)(GLsizei n, GLuint *buffers); + extern GLboolean (CODEGEN_FUNCPTR *IsBuffer)(GLuint buffer); + extern void (CODEGEN_FUNCPTR *BufferData)(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage); + extern void (CODEGEN_FUNCPTR *BufferSubData)(GLenum target, GLintptr offset, GLsizeiptr size, const GLvoid *data); + extern void (CODEGEN_FUNCPTR *GetBufferSubData)(GLenum target, GLintptr offset, GLsizeiptr size, GLvoid *data); + extern GLvoid* (CODEGEN_FUNCPTR *MapBuffer)(GLenum target, GLenum access); + extern GLboolean (CODEGEN_FUNCPTR *UnmapBuffer)(GLenum target); + extern void (CODEGEN_FUNCPTR *GetBufferParameteriv)(GLenum target, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetBufferPointerv)(GLenum target, GLenum pname, GLvoid* *params); + + // Extension: 2.0 + extern void (CODEGEN_FUNCPTR *BlendEquationSeparate)(GLenum modeRGB, GLenum modeAlpha); + extern void (CODEGEN_FUNCPTR *DrawBuffers)(GLsizei n, const GLenum *bufs); + extern void (CODEGEN_FUNCPTR *StencilOpSeparate)(GLenum face, GLenum sfail, GLenum dpfail, GLenum dppass); + extern void (CODEGEN_FUNCPTR *StencilFuncSeparate)(GLenum face, GLenum func, GLint ref, GLuint mask); + extern void (CODEGEN_FUNCPTR *StencilMaskSeparate)(GLenum face, GLuint mask); + extern void (CODEGEN_FUNCPTR *AttachShader)(GLuint program, GLuint shader); + extern void (CODEGEN_FUNCPTR *BindAttribLocation)(GLuint program, GLuint index, const GLchar *name); + extern void (CODEGEN_FUNCPTR *CompileShader)(GLuint shader); + extern GLuint (CODEGEN_FUNCPTR *CreateProgram)(); + extern GLuint (CODEGEN_FUNCPTR *CreateShader)(GLenum type); + extern void (CODEGEN_FUNCPTR *DeleteProgram)(GLuint program); + extern void (CODEGEN_FUNCPTR 
*DeleteShader)(GLuint shader); + extern void (CODEGEN_FUNCPTR *DetachShader)(GLuint program, GLuint shader); + extern void (CODEGEN_FUNCPTR *DisableVertexAttribArray)(GLuint index); + extern void (CODEGEN_FUNCPTR *EnableVertexAttribArray)(GLuint index); + extern void (CODEGEN_FUNCPTR *GetActiveAttrib)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name); + extern void (CODEGEN_FUNCPTR *GetActiveUniform)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name); + extern void (CODEGEN_FUNCPTR *GetAttachedShaders)(GLuint program, GLsizei maxCount, GLsizei *count, GLuint *obj); + extern GLint (CODEGEN_FUNCPTR *GetAttribLocation)(GLuint program, const GLchar *name); + extern void (CODEGEN_FUNCPTR *GetProgramiv)(GLuint program, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetProgramInfoLog)(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog); + extern void (CODEGEN_FUNCPTR *GetShaderiv)(GLuint shader, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetShaderInfoLog)(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog); + extern void (CODEGEN_FUNCPTR *GetShaderSource)(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *source); + extern GLint (CODEGEN_FUNCPTR *GetUniformLocation)(GLuint program, const GLchar *name); + extern void (CODEGEN_FUNCPTR *GetUniformfv)(GLuint program, GLint location, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetUniformiv)(GLuint program, GLint location, GLint *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribdv)(GLuint index, GLenum pname, GLdouble *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribfv)(GLuint index, GLenum pname, GLfloat *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribiv)(GLuint index, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribPointerv)(GLuint index, GLenum pname, GLvoid* *pointer); + extern GLboolean (CODEGEN_FUNCPTR *IsProgram)(GLuint program); + extern GLboolean (CODEGEN_FUNCPTR *IsShader)(GLuint shader); + extern void (CODEGEN_FUNCPTR *LinkProgram)(GLuint program); + extern void (CODEGEN_FUNCPTR *ShaderSource)(GLuint shader, GLsizei count, const GLchar* const *string, const GLint *length); + extern void (CODEGEN_FUNCPTR *UseProgram)(GLuint program); + extern void (CODEGEN_FUNCPTR *Uniform1f)(GLint location, GLfloat v0); + extern void (CODEGEN_FUNCPTR *Uniform2f)(GLint location, GLfloat v0, GLfloat v1); + extern void (CODEGEN_FUNCPTR *Uniform3f)(GLint location, GLfloat v0, GLfloat v1, GLfloat v2); + extern void (CODEGEN_FUNCPTR *Uniform4f)(GLint location, GLfloat v0, GLfloat v1, GLfloat v2, GLfloat v3); + extern void (CODEGEN_FUNCPTR *Uniform1i)(GLint location, GLint v0); + extern void (CODEGEN_FUNCPTR *Uniform2i)(GLint location, GLint v0, GLint v1); + extern void (CODEGEN_FUNCPTR *Uniform3i)(GLint location, GLint v0, GLint v1, GLint v2); + extern void (CODEGEN_FUNCPTR *Uniform4i)(GLint location, GLint v0, GLint v1, GLint v2, GLint v3); + extern void (CODEGEN_FUNCPTR *Uniform1fv)(GLint location, GLsizei count, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *Uniform2fv)(GLint location, GLsizei count, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *Uniform3fv)(GLint location, GLsizei count, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *Uniform4fv)(GLint location, GLsizei count, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *Uniform1iv)(GLint location, GLsizei count, const GLint *value); + extern 
void (CODEGEN_FUNCPTR *Uniform2iv)(GLint location, GLsizei count, const GLint *value); + extern void (CODEGEN_FUNCPTR *Uniform3iv)(GLint location, GLsizei count, const GLint *value); + extern void (CODEGEN_FUNCPTR *Uniform4iv)(GLint location, GLsizei count, const GLint *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix2fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix3fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix4fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *ValidateProgram)(GLuint program); + extern void (CODEGEN_FUNCPTR *VertexAttribPointer)(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const GLvoid *pointer); + + // Extension: 2.1 + extern void (CODEGEN_FUNCPTR *UniformMatrix2x3fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix3x2fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix2x4fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix4x2fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix3x4fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *UniformMatrix4x3fv)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); + + // Extension: ARB_vertex_array_object + extern void (CODEGEN_FUNCPTR *BindVertexArray)(GLuint ren_array); + extern void (CODEGEN_FUNCPTR *DeleteVertexArrays)(GLsizei n, const GLuint *arrays); + extern void (CODEGEN_FUNCPTR *GenVertexArrays)(GLsizei n, GLuint *arrays); + extern GLboolean (CODEGEN_FUNCPTR *IsVertexArray)(GLuint ren_array); + + // Extension: ARB_map_buffer_range + extern GLvoid* (CODEGEN_FUNCPTR *MapBufferRange)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access); + extern void (CODEGEN_FUNCPTR *FlushMappedBufferRange)(GLenum target, GLintptr offset, GLsizeiptr length); + + // Extension: ARB_framebuffer_object + extern GLboolean (CODEGEN_FUNCPTR *IsRenderbuffer)(GLuint renderbuffer); + extern void (CODEGEN_FUNCPTR *BindRenderbuffer)(GLenum target, GLuint renderbuffer); + extern void (CODEGEN_FUNCPTR *DeleteRenderbuffers)(GLsizei n, const GLuint *renderbuffers); + extern void (CODEGEN_FUNCPTR *GenRenderbuffers)(GLsizei n, GLuint *renderbuffers); + extern void (CODEGEN_FUNCPTR *RenderbufferStorage)(GLenum target, GLenum internalformat, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *GetRenderbufferParameteriv)(GLenum target, GLenum pname, GLint *params); + extern GLboolean (CODEGEN_FUNCPTR *IsFramebuffer)(GLuint framebuffer); + extern void (CODEGEN_FUNCPTR *BindFramebuffer)(GLenum target, GLuint framebuffer); + extern void (CODEGEN_FUNCPTR *DeleteFramebuffers)(GLsizei n, const GLuint *framebuffers); + extern void (CODEGEN_FUNCPTR *GenFramebuffers)(GLsizei n, GLuint *framebuffers); + extern GLenum (CODEGEN_FUNCPTR *CheckFramebufferStatus)(GLenum target); + extern void (CODEGEN_FUNCPTR *FramebufferTexture1D)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level); + extern void (CODEGEN_FUNCPTR *FramebufferTexture2D)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint 
level); + extern void (CODEGEN_FUNCPTR *FramebufferTexture3D)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset); + extern void (CODEGEN_FUNCPTR *FramebufferRenderbuffer)(GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer); + extern void (CODEGEN_FUNCPTR *GetFramebufferAttachmentParameteriv)(GLenum target, GLenum attachment, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GenerateMipmap)(GLenum target); + extern void (CODEGEN_FUNCPTR *BlitFramebuffer)(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter); + extern void (CODEGEN_FUNCPTR *RenderbufferStorageMultisample)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height); + extern void (CODEGEN_FUNCPTR *FramebufferTextureLayer)(GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer); + + // Extension: 3.0 + extern void (CODEGEN_FUNCPTR *ColorMaski)(GLuint index, GLboolean r, GLboolean g, GLboolean b, GLboolean a); + extern void (CODEGEN_FUNCPTR *GetBooleani_v)(GLenum target, GLuint index, GLboolean *data); + extern void (CODEGEN_FUNCPTR *GetIntegeri_v)(GLenum target, GLuint index, GLint *data); + extern void (CODEGEN_FUNCPTR *Enablei)(GLenum target, GLuint index); + extern void (CODEGEN_FUNCPTR *Disablei)(GLenum target, GLuint index); + extern GLboolean (CODEGEN_FUNCPTR *IsEnabledi)(GLenum target, GLuint index); + extern void (CODEGEN_FUNCPTR *BeginTransformFeedback)(GLenum primitiveMode); + extern void (CODEGEN_FUNCPTR *EndTransformFeedback)(); + extern void (CODEGEN_FUNCPTR *BindBufferRange)(GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size); + extern void (CODEGEN_FUNCPTR *BindBufferBase)(GLenum target, GLuint index, GLuint buffer); + extern void (CODEGEN_FUNCPTR *TransformFeedbackVaryings)(GLuint program, GLsizei count, const GLchar* const *varyings, GLenum bufferMode); + extern void (CODEGEN_FUNCPTR *GetTransformFeedbackVarying)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLsizei *size, GLenum *type, GLchar *name); + extern void (CODEGEN_FUNCPTR *ClampColor)(GLenum target, GLenum clamp); + extern void (CODEGEN_FUNCPTR *BeginConditionalRender)(GLuint id, GLenum mode); + extern void (CODEGEN_FUNCPTR *EndConditionalRender)(); + extern void (CODEGEN_FUNCPTR *VertexAttribIPointer)(GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid *pointer); + extern void (CODEGEN_FUNCPTR *GetVertexAttribIiv)(GLuint index, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetVertexAttribIuiv)(GLuint index, GLenum pname, GLuint *params); + extern void (CODEGEN_FUNCPTR *VertexAttribI1i)(GLuint index, GLint x); + extern void (CODEGEN_FUNCPTR *VertexAttribI2i)(GLuint index, GLint x, GLint y); + extern void (CODEGEN_FUNCPTR *VertexAttribI3i)(GLuint index, GLint x, GLint y, GLint z); + extern void (CODEGEN_FUNCPTR *VertexAttribI4i)(GLuint index, GLint x, GLint y, GLint z, GLint w); + extern void (CODEGEN_FUNCPTR *VertexAttribI1ui)(GLuint index, GLuint x); + extern void (CODEGEN_FUNCPTR *VertexAttribI2ui)(GLuint index, GLuint x, GLuint y); + extern void (CODEGEN_FUNCPTR *VertexAttribI3ui)(GLuint index, GLuint x, GLuint y, GLuint z); + extern void (CODEGEN_FUNCPTR *VertexAttribI4ui)(GLuint index, GLuint x, GLuint y, GLuint z, GLuint w); + extern void (CODEGEN_FUNCPTR *VertexAttribI1iv)(GLuint index, const GLint *v); + extern void (CODEGEN_FUNCPTR 
*VertexAttribI2iv)(GLuint index, const GLint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI3iv)(GLuint index, const GLint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4iv)(GLuint index, const GLint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI1uiv)(GLuint index, const GLuint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI2uiv)(GLuint index, const GLuint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI3uiv)(GLuint index, const GLuint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4uiv)(GLuint index, const GLuint *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4bv)(GLuint index, const GLbyte *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4sv)(GLuint index, const GLshort *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4ubv)(GLuint index, const GLubyte *v); + extern void (CODEGEN_FUNCPTR *VertexAttribI4usv)(GLuint index, const GLushort *v); + extern void (CODEGEN_FUNCPTR *GetUniformuiv)(GLuint program, GLint location, GLuint *params); + extern void (CODEGEN_FUNCPTR *BindFragDataLocation)(GLuint program, GLuint color, const GLchar *name); + extern GLint (CODEGEN_FUNCPTR *GetFragDataLocation)(GLuint program, const GLchar *name); + extern void (CODEGEN_FUNCPTR *Uniform1ui)(GLint location, GLuint v0); + extern void (CODEGEN_FUNCPTR *Uniform2ui)(GLint location, GLuint v0, GLuint v1); + extern void (CODEGEN_FUNCPTR *Uniform3ui)(GLint location, GLuint v0, GLuint v1, GLuint v2); + extern void (CODEGEN_FUNCPTR *Uniform4ui)(GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3); + extern void (CODEGEN_FUNCPTR *Uniform1uiv)(GLint location, GLsizei count, const GLuint *value); + extern void (CODEGEN_FUNCPTR *Uniform2uiv)(GLint location, GLsizei count, const GLuint *value); + extern void (CODEGEN_FUNCPTR *Uniform3uiv)(GLint location, GLsizei count, const GLuint *value); + extern void (CODEGEN_FUNCPTR *Uniform4uiv)(GLint location, GLsizei count, const GLuint *value); + extern void (CODEGEN_FUNCPTR *TexParameterIiv)(GLenum target, GLenum pname, const GLint *params); + extern void (CODEGEN_FUNCPTR *TexParameterIuiv)(GLenum target, GLenum pname, const GLuint *params); + extern void (CODEGEN_FUNCPTR *GetTexParameterIiv)(GLenum target, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetTexParameterIuiv)(GLenum target, GLenum pname, GLuint *params); + extern void (CODEGEN_FUNCPTR *ClearBufferiv)(GLenum buffer, GLint drawbuffer, const GLint *value); + extern void (CODEGEN_FUNCPTR *ClearBufferuiv)(GLenum buffer, GLint drawbuffer, const GLuint *value); + extern void (CODEGEN_FUNCPTR *ClearBufferfv)(GLenum buffer, GLint drawbuffer, const GLfloat *value); + extern void (CODEGEN_FUNCPTR *ClearBufferfi)(GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil); + extern const GLubyte * (CODEGEN_FUNCPTR *GetStringi)(GLenum name, GLuint index); + + // Extension: ARB_uniform_buffer_object + extern void (CODEGEN_FUNCPTR *GetUniformIndices)(GLuint program, GLsizei uniformCount, const GLchar* const *uniformNames, GLuint *uniformIndices); + extern void (CODEGEN_FUNCPTR *GetActiveUniformsiv)(GLuint program, GLsizei uniformCount, const GLuint *uniformIndices, GLenum pname, GLint *params); + extern void (CODEGEN_FUNCPTR *GetActiveUniformName)(GLuint program, GLuint uniformIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformName); + extern GLuint (CODEGEN_FUNCPTR *GetUniformBlockIndex)(GLuint program, const GLchar *uniformBlockName); + extern void (CODEGEN_FUNCPTR *GetActiveUniformBlockiv)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params); + extern void 
(CODEGEN_FUNCPTR *GetActiveUniformBlockName)(GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformBlockName); + extern void (CODEGEN_FUNCPTR *UniformBlockBinding)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding); + + // Extension: ARB_copy_buffer + extern void (CODEGEN_FUNCPTR *CopyBufferSubData)(GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size); + + // Extension: 3.1 + extern void (CODEGEN_FUNCPTR *DrawArraysInstanced)(GLenum mode, GLint first, GLsizei count, GLsizei instancecount); + extern void (CODEGEN_FUNCPTR *DrawElementsInstanced)(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices, GLsizei instancecount); + extern void (CODEGEN_FUNCPTR *TexBuffer)(GLenum target, GLenum internalformat, GLuint buffer); + extern void (CODEGEN_FUNCPTR *PrimitiveRestartIndex)(GLuint index); + + // Legacy + extern void (CODEGEN_FUNCPTR *EnableClientState)(GLenum cap); + extern void (CODEGEN_FUNCPTR *DisableClientState)(GLenum cap); + extern void (CODEGEN_FUNCPTR *VertexPointer)(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr); + extern void (CODEGEN_FUNCPTR *NormalPointer)(GLenum type, GLsizei stride, const GLvoid *ptr); + extern void (CODEGEN_FUNCPTR *ColorPointer)(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr); + extern void (CODEGEN_FUNCPTR *TexCoordPointer)(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr); + extern void (CODEGEN_FUNCPTR *TexEnvi)(GLenum target, GLenum pname, GLint param); + extern void (CODEGEN_FUNCPTR *MatrixMode)(GLenum mode); + extern void (CODEGEN_FUNCPTR *LoadIdentity)(void); + extern void (CODEGEN_FUNCPTR *Ortho)(GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble near_val, GLdouble far_val); + extern void (CODEGEN_FUNCPTR *Color3d)(GLdouble red, GLdouble green, GLdouble blue); +} + +#endif // OPENGL_NOLOAD_STYLE_HPP diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index fc291a862a..43f4d613bc 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -45,8 +45,7 @@ #include #ifdef HAVE_CUDA - #include - #include + #include #include #define CUDART_MINIMUM_REQUIRED_VERSION 4010 @@ -69,33 +68,89 @@ using namespace cv::gpu; namespace { - // Compares value to set using the given comparator. Returns true if - // there is at least one element x in the set satisfying to: x cmp value - // predicate. 
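
Note on the header closed above: it follows the usual "no-load style" loader pattern. Every GL entry point is an extern function pointer in namespace gl, resolved at runtime from the active context rather than linked against a GL import library. The sketch below only illustrates that pattern; the LoadFunctions helper, the GetProcAddressFunc callback, and the empty CODEGEN_FUNCPTR definition are hypothetical stand-ins, not code from this patch.

    // Illustrative only: how a "no-load" header is typically initialized.
    #define CODEGEN_FUNCPTR   // calling-convention macro; assumed empty here

    namespace gl
    {
        // one pointer per entry point, mirroring the declarations above
        void (CODEGEN_FUNCPTR *Clear)(unsigned int mask) = 0;
    }

    // e.g. a wrapper around wglGetProcAddress / glXGetProcAddressARB
    typedef void* (*GetProcAddressFunc)(const char* name);

    namespace sys
    {
        // resolve the pointers once a GL context is current
        bool LoadFunctions(GetProcAddressFunc getProc)
        {
            gl::Clear = reinterpret_cast<void (CODEGEN_FUNCPTR *)(unsigned int)>(getProc("glClear"));
            return gl::Clear != 0;
        }
    }
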
- template - bool compareToSet(const std::string& set_as_str, int value, Comparer cmp) + class CudaArch + { + public: + CudaArch(); + + bool builtWith(FeatureSet feature_set) const; + bool hasPtx(int major, int minor) const; + bool hasBin(int major, int minor) const; + bool hasEqualOrLessPtx(int major, int minor) const; + bool hasEqualOrGreaterPtx(int major, int minor) const; + bool hasEqualOrGreaterBin(int major, int minor) const; + + private: + static void fromStr(const string& set_as_str, vector& arr); + + vector bin; + vector ptx; + vector features; + }; + + const CudaArch cudaArch; + + CudaArch::CudaArch() + { + #ifdef HAVE_CUDA + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + #endif + } + + bool CudaArch::builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool CudaArch::hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool CudaArch::hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + void CudaArch::fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) - return false; + return; - std::stringstream stream(set_as_str); + istringstream stream(set_as_str); int cur_value; while (!stream.eof()) { stream >> cur_value; - if (cmp(cur_value, value)) - return true; + arr.push_back(cur_value); } - return false; + sort(arr.begin(), arr.end()); } } bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal()); + return cudaArch.builtWith(feature_set); #else (void)feature_set; return false; @@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor) bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to()); + return cudaArch.hasPtx(major, minor); #else (void)major; (void)minor; @@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor) bool cv::gpu::TargetArchs::hasBin(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to()); + return cudaArch.hasBin(major, minor); #else (void)major; (void)minor; @@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, - std::less_equal()); + return cudaArch.hasEqualOrLessPtx(major, minor); #else (void)major; (void)minor; @@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { - return hasEqualOrGreaterPtx(major, minor) || - hasEqualOrGreaterBin(major, minor); + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } bool 
cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal()); + return cudaArch.hasEqualOrGreaterPtx(major, minor); #else (void)major; (void)minor; @@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, - std::greater_equal()); + return cudaArch.hasEqualOrGreaterBin(major, minor); #else (void)major; (void)minor; @@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) #endif } +bool cv::gpu::deviceSupports(FeatureSet feature_set) +{ + static int versions[] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else + { + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; + } + + return TargetArchs::builtWith(feature_set) && (version >= feature_set); +} + #if !defined (HAVE_CUDA) #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") @@ -316,18 +393,6 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) namespace { - template void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) - { - *attribute = T(); - //CUresult error = CUDA_SUCCESS;// = cuDeviceGetAttribute( attribute, device_attribute, device ); why link erros under ubuntu?? 
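
The new cv::gpu::deviceSupports() above combines the compile-time architecture list (TargetArchs, now backed by the CudaArch cache) with the runtime compute capability of the current device, memoizing the version per device id. A minimal usage sketch, assuming only the public API introduced in this hunk (FEATURE_SET_COMPUTE_20 is an existing FeatureSet value):

    #include "opencv2/core/gpumat.hpp"
    #include <cstdio>

    int main()
    {
        // True only if the library was built with code for this feature set
        // AND the currently selected device is new enough to run it.
        if (cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20))
            std::printf("sm_20 (Fermi-class) kernels can be used on this device\n");
        else
            std::printf("falling back to pre-Fermi code paths\n");
        return 0;
    }
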
- CUresult error = cuDeviceGetAttribute( attribute, device_attribute, device ); - if( CUDA_SUCCESS == error ) - return; - - printf("Driver API error = %04d\n", error); - cv::gpu::error("driver API error", __FILE__, __LINE__); - } - int convertSMVer2Cores(int major, int minor) { // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM @@ -336,7 +401,7 @@ namespace int Cores; } SMtoCores; - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, { -1, -1 } }; + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; int index = 0; while (gpuArchCoresPerSM[index].SM != -1) @@ -345,7 +410,7 @@ namespace return gpuArchCoresPerSM[index].Cores; index++; } - printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor); + return -1; } } @@ -383,22 +448,13 @@ void cv::gpu::printCudaDeviceInfo(int device) printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", - prop.multiProcessorCount, convertSMVer2Cores(prop.major, prop.minor), - convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output - int memoryClock, memBusWidth, L2CacheSize; - getCudaAttribute( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); - getCudaAttribute( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); - getCudaAttribute( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); - - printf(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f); - printf(" Memory Bus Width: %d-bit\n", memBusWidth); - if (L2CacheSize) - printf(" L2 Cache Size: %d bytes\n", L2CacheSize); - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); @@ -458,7 +514,12 @@ void cv::gpu::printShortCudaDeviceInfo(int device) const char *arch_str = prop.major < 2 ? 
" (not Fermi)" : ""; printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); } fflush(stdout); @@ -704,6 +765,43 @@ cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), re m.download(*this); } +void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m) +{ + int area = rows * cols; + if (m.empty() || m.type() != type || !m.isContinuous() || m.size().area() < area) + m.create(1, area, type); + + m.cols = cols; + m.rows = rows; + m.step = m.elemSize() * cols; + m.flags |= Mat::CONTINUOUS_FLAG; +} + +void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) +{ + if (m.empty() || m.type() != type || m.data != m.datastart) + m.create(rows, cols, type); + else + { + const size_t esz = m.elemSize(); + const ptrdiff_t delta2 = m.dataend - m.datastart; + + const size_t minstep = m.cols * esz; + + Size wholeSize; + wholeSize.height = std::max(static_cast((delta2 - minstep) / m.step + 1), m.rows); + wholeSize.width = std::max(static_cast((delta2 - m.step * (wholeSize.height - 1)) / esz), m.cols); + + if (wholeSize.height < rows || wholeSize.width < cols) + m.create(rows, cols, type); + else + { + m.cols = cols; + m.rows = rows; + } + } +} + namespace { class GpuFuncTable diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 42c287593f..0776ca6248 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -922,8 +922,8 @@ _InputArray::_InputArray(const Mat& m) : flags(MAT), obj((void*)&m) {} _InputArray::_InputArray(const vector& vec) : flags(STD_VECTOR_MAT), obj((void*)&vec) {} _InputArray::_InputArray(const double& val) : flags(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F), obj((void*)&val), sz(Size(1,1)) {} _InputArray::_InputArray(const MatExpr& expr) : flags(FIXED_TYPE + FIXED_SIZE + EXPR), obj((void*)&expr) {} -_InputArray::_InputArray(const GlBuffer& buf) : flags(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER), obj((void*)&buf) {} -_InputArray::_InputArray(const GlTexture& tex) : flags(FIXED_TYPE + FIXED_SIZE + OPENGL_TEXTURE), obj((void*)&tex) {} +_InputArray::_InputArray(const GlBuffer& buf) : flags(OPENGL_BUFFER), obj((void*)&buf) {} +_InputArray::_InputArray(const GlTexture2D &tex) : flags(OPENGL_TEXTURE2D), obj((void*)&tex) {} _InputArray::_InputArray(const gpu::GpuMat& d_mat) : flags(GPU_MAT), obj((void*)&d_mat) {} Mat _InputArray::getMat(int i) const @@ -1076,14 +1076,14 @@ GlBuffer _InputArray::getGlBuffer() const } } -GlTexture _InputArray::getGlTexture() const +GlTexture2D _InputArray::getGlTexture2D() const { int k = kind(); - CV_Assert(k == OPENGL_TEXTURE); + CV_Assert(k == OPENGL_TEXTURE2D); //if( k == OPENGL_TEXTURE ) { - const GlTexture* tex = (const GlTexture*)obj; + const GlTexture2D* tex = (const GlTexture2D*)obj; return *tex; } } @@ -1168,10 +1168,10 @@ Size _InputArray::size(int i) const return buf->size(); } - if( k == OPENGL_TEXTURE ) + if( k == OPENGL_TEXTURE2D ) { CV_Assert( i < 0 ); - const GlTexture* tex = (const GlTexture*)obj; + const GlTexture2D* tex = (const GlTexture2D*)obj; return tex->size(); 
} @@ -1216,9 +1216,6 @@ int _InputArray::type(int i) const if( k == OPENGL_BUFFER ) return ((const GlBuffer*)obj)->type(); - if( k == OPENGL_TEXTURE ) - return ((const GlTexture*)obj)->type(); - CV_Assert( k == GPU_MAT ); //if( k == GPU_MAT ) return ((const gpu::GpuMat*)obj)->type(); @@ -1271,8 +1268,8 @@ bool _InputArray::empty() const if( k == OPENGL_BUFFER ) return ((const GlBuffer*)obj)->empty(); - if( k == OPENGL_TEXTURE ) - return ((const GlTexture*)obj)->empty(); + if( k == OPENGL_TEXTURE2D ) + return ((const GlTexture2D*)obj)->empty(); CV_Assert( k == GPU_MAT ); //if( k == GPU_MAT ) @@ -1285,10 +1282,14 @@ _OutputArray::~_OutputArray() {} _OutputArray::_OutputArray(Mat& m) : _InputArray(m) {} _OutputArray::_OutputArray(vector& vec) : _InputArray(vec) {} _OutputArray::_OutputArray(gpu::GpuMat& d_mat) : _InputArray(d_mat) {} +_OutputArray::_OutputArray(GlBuffer& buf) : _InputArray(buf) {} +_OutputArray::_OutputArray(GlTexture2D& tex) : _InputArray(tex) {} _OutputArray::_OutputArray(const Mat& m) : _InputArray(m) {flags |= FIXED_SIZE|FIXED_TYPE;} _OutputArray::_OutputArray(const vector& vec) : _InputArray(vec) {flags |= FIXED_SIZE;} _OutputArray::_OutputArray(const gpu::GpuMat& d_mat) : _InputArray(d_mat) {flags |= FIXED_SIZE|FIXED_TYPE;} +_OutputArray::_OutputArray(const GlBuffer& buf) : _InputArray(buf) {flags |= FIXED_SIZE|FIXED_TYPE;} +_OutputArray::_OutputArray(const GlTexture2D& tex) : _InputArray(tex) {flags |= FIXED_SIZE|FIXED_TYPE;} bool _OutputArray::fixedSize() const @@ -1318,6 +1319,13 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int ((gpu::GpuMat*)obj)->create(_sz, mtype); return; } + if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 ) + { + CV_Assert(!fixedSize() || ((GlBuffer*)obj)->size() == _sz); + CV_Assert(!fixedType() || ((GlBuffer*)obj)->type() == mtype); + ((GlBuffer*)obj)->create(_sz, mtype); + return; + } int sizes[] = {_sz.height, _sz.width}; create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); } @@ -1339,6 +1347,13 @@ void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransp ((gpu::GpuMat*)obj)->create(rows, cols, mtype); return; } + if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 ) + { + CV_Assert(!fixedSize() || ((GlBuffer*)obj)->size() == Size(cols, rows)); + CV_Assert(!fixedType() || ((GlBuffer*)obj)->type() == mtype); + ((GlBuffer*)obj)->create(rows, cols, mtype); + return; + } int sizes[] = {rows, cols}; create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); } @@ -1558,6 +1573,18 @@ void _OutputArray::release() const return; } + if( k == OPENGL_BUFFER ) + { + ((GlBuffer*)obj)->release(); + return; + } + + if( k == OPENGL_TEXTURE2D ) + { + ((GlTexture2D*)obj)->release(); + return; + } + if( k == NONE ) return; @@ -1623,6 +1650,20 @@ gpu::GpuMat& _OutputArray::getGpuMatRef() const return *(gpu::GpuMat*)obj; } +GlBuffer& _OutputArray::getGlBufferRef() const +{ + int k = kind(); + CV_Assert( k == OPENGL_BUFFER ); + return *(GlBuffer*)obj; +} + +GlTexture2D& _OutputArray::getGlTexture2DRef() const +{ + int k = kind(); + CV_Assert( k == OPENGL_TEXTURE2D ); + return *(GlTexture2D*)obj; +} + static _OutputArray _none; OutputArray noArray() { return _none; } diff --git a/modules/core/src/opengl_interop.cpp b/modules/core/src/opengl_interop.cpp index 12589b7ba3..befc63f3f7 100644 --- a/modules/core/src/opengl_interop.cpp +++ b/modules/core/src/opengl_interop.cpp @@ -41,26 +41,11 @@ //M*/ #include "precomp.hpp" -#include #include 
"opencv2/core/opengl_interop.hpp" #include "opencv2/core/gpumat.hpp" -#if defined WIN32 || defined _WIN32 || defined WINCE -#include -#undef small -#undef min -#undef max -#undef abs -#endif - #ifdef HAVE_OPENGL - #ifdef __APPLE__ - #include - #include - #else - #include - #include - #endif + #include "gl_core_3_1.hpp" #ifdef HAVE_CUDA #include @@ -72,213 +57,258 @@ using namespace std; using namespace cv; using namespace cv::gpu; -#ifndef HAVE_OPENGL - #define throw_nogl CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support") - #define throw_nocuda CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") -#else - #define throw_nogl CV_Error(CV_OpenGlNotSupported, "OpenGL context doesn't exist") - - #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - #define throw_nocuda CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") +namespace +{ + #ifndef HAVE_OPENGL + void throw_nogl() { CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support"); } #else - #if defined(__GNUC__) - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) - #else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) - #endif + void throw_nogl() { CV_Error(CV_OpenGlApiCallError, "OpenGL context doesn't exist"); } - namespace - { - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + #ifndef HAVE_CUDA + void throw_nocuda() { CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support"); } + #else + void throw_nocuda() { CV_Error(CV_StsNotImplemented, "The called functionality is disabled for current build or platform"); } + + #if defined(__GNUC__) + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) + #else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) + #endif + + void ___cudaSafeCall(cudaError_t err, const char* file, const int line, const char* func = "") { if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); } - } - #endif // HAVE_CUDA -#endif + #endif + #endif +} -namespace +bool cv::checkGlError(const char* file, const int line, const char* func) { - class EmptyGlFuncTab : public CvOpenGlFuncTab +#ifndef HAVE_OPENGL + (void) file; + (void) line; + (void) func; + return true; +#else + GLenum err = gl::GetError(); + + if (err != gl::NO_ERROR_) { - public: - void genBuffers(int, unsigned int*) const { throw_nogl; } - void deleteBuffers(int, const unsigned int*) const { throw_nogl; } + const char* msg; - void bufferData(unsigned int, ptrdiff_t, const void*, unsigned int) const { throw_nogl; } - void bufferSubData(unsigned int, ptrdiff_t, ptrdiff_t, const void*) const { throw_nogl; } + switch (err) + { + case gl::INVALID_ENUM: + msg = "An unacceptable value is specified for an enumerated argument"; + break; - void bindBuffer(unsigned int, unsigned int) const { throw_nogl; } + case gl::INVALID_VALUE: + msg = "A numeric argument is out of range"; + break; - void* mapBuffer(unsigned int, unsigned int) const { throw_nogl; return 0; } - void unmapBuffer(unsigned int) const { throw_nogl; } + case gl::INVALID_OPERATION: + msg = "The specified operation is not allowed in the current state"; + break; - void generateBitmapFont(const std::string&, int, int, bool, bool, int, int, int) const { throw_nogl; } + case gl::OUT_OF_MEMORY: + msg = "There is not enough 
memory left to execute the command"; + break; - bool isGlContextInitialized() const { return false; } - }; + default: + msg = "Unknown error"; + }; - const CvOpenGlFuncTab* g_glFuncTab = 0; + cvError(CV_OpenGlApiCallError, func, msg, file, line); -#if defined HAVE_CUDA || defined HAVE_OPENGL - const CvOpenGlFuncTab* glFuncTab() - { - static EmptyGlFuncTab empty; - return g_glFuncTab ? g_glFuncTab : ∅ + return false; } + + return true; #endif } -CvOpenGlFuncTab::~CvOpenGlFuncTab() -{ - if (g_glFuncTab == this) - g_glFuncTab = 0; -} - -void icvSetOpenGlFuncTab(const CvOpenGlFuncTab* tab) -{ - g_glFuncTab = tab; -} - #ifdef HAVE_OPENGL - #ifndef GL_DYNAMIC_DRAW - #define GL_DYNAMIC_DRAW 0x88E8 - #endif +namespace +{ + const GLenum gl_types[] = { gl::UNSIGNED_BYTE, gl::BYTE, gl::UNSIGNED_SHORT, gl::SHORT, gl::INT, gl::FLOAT, gl::DOUBLE }; +} +#endif - #ifndef GL_READ_WRITE - #define GL_READ_WRITE 0x88BA - #endif - - #ifndef GL_BGR - #define GL_BGR 0x80E0 - #endif - - #ifndef GL_BGRA - #define GL_BGRA 0x80E1 - #endif - - namespace - { - const GLenum gl_types[] = {GL_UNSIGNED_BYTE, GL_BYTE, GL_UNSIGNED_SHORT, GL_SHORT, GL_INT, GL_FLOAT, GL_DOUBLE}; - - #ifdef HAVE_CUDA - bool g_isCudaGlDeviceInitialized = false; - #endif - } -#endif // HAVE_OPENGL +//////////////////////////////////////////////////////////////////////// +// setGlDevice void cv::gpu::setGlDevice(int device) { -#if !defined HAVE_CUDA || defined(CUDA_DISABLER) - (void)device; - throw_nocuda; +#ifndef HAVE_OPENGL + (void) device; + throw_nogl(); #else - #ifndef HAVE_OPENGL - (void)device; - throw_nogl; + #if !defined(HAVE_CUDA) || defined(CUDA_DISABLER) + (void) device; + throw_nocuda(); #else - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; - cudaSafeCall( cudaGLSetGLDevice(device) ); - - g_isCudaGlDeviceInitialized = true; #endif #endif } //////////////////////////////////////////////////////////////////////// -// CudaGlInterop +// CudaResource + +#if defined(HAVE_OPENGL) && defined(HAVE_CUDA) && !defined(CUDA_DISABLER) -#if defined HAVE_CUDA && defined HAVE_OPENGL namespace { - class CudaGlInterop + class CudaResource { public: - CudaGlInterop(); - ~CudaGlInterop(); + CudaResource(); + ~CudaResource(); - void registerBuffer(unsigned int buffer); + void registerBuffer(GLuint buffer); + void release(); - void copyFrom(const GpuMat& mat, cudaStream_t stream = 0); + void copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream = 0); + void copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream = 0); - GpuMat map(int rows, int cols, int type, cudaStream_t stream = 0); + void* map(cudaStream_t stream = 0); void unmap(cudaStream_t stream = 0); private: cudaGraphicsResource_t resource_; + GLuint buffer_; + + class GraphicsMapHolder; }; - inline CudaGlInterop::CudaGlInterop() : resource_(0) + CudaResource::CudaResource() : resource_(0), buffer_(0) { } - CudaGlInterop::~CudaGlInterop() + CudaResource::~CudaResource() { - if (resource_) - { - cudaGraphicsUnregisterResource(resource_); - resource_ = 0; - } + release(); } - void CudaGlInterop::registerBuffer(unsigned int buffer) + void CudaResource::registerBuffer(GLuint buffer) { - if (!g_isCudaGlDeviceInitialized) - cvError(CV_GpuApiCallError, "registerBuffer", "cuda GL device wasn't initialized, call setGlDevice", __FILE__, __LINE__); + CV_DbgAssert( buffer != 0 ); + + if (buffer_ == buffer) + return; cudaGraphicsResource_t resource; cudaSafeCall( cudaGraphicsGLRegisterBuffer(&resource, buffer, cudaGraphicsMapFlagsNone) 
); + release(); + resource_ = resource; + buffer_ = buffer; } - void CudaGlInterop::copyFrom(const GpuMat& mat, cudaStream_t stream) + void CudaResource::release() { - CV_Assert(resource_ != 0); + if (resource_) + cudaGraphicsUnregisterResource(resource_); - cudaSafeCall( cudaGraphicsMapResources(1, &resource_, stream) ); + resource_ = 0; + buffer_ = 0; + } - void* dst_ptr; - size_t num_bytes; - cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&dst_ptr, &num_bytes, resource_) ); + class CudaResource::GraphicsMapHolder + { + public: + GraphicsMapHolder(cudaGraphicsResource_t* resource, cudaStream_t stream); + ~GraphicsMapHolder(); - const void* src_ptr = mat.ptr(); - size_t widthBytes = mat.cols * mat.elemSize(); + void reset(); - CV_Assert(widthBytes * mat.rows <= num_bytes); + private: + cudaGraphicsResource_t* resource_; + cudaStream_t stream_; + }; + + CudaResource::GraphicsMapHolder::GraphicsMapHolder(cudaGraphicsResource_t* resource, cudaStream_t stream) : resource_(resource), stream_(stream) + { + if (resource_) + cudaSafeCall( cudaGraphicsMapResources(1, resource_, stream_) ); + } + + CudaResource::GraphicsMapHolder::~GraphicsMapHolder() + { + if (resource_) + cudaGraphicsUnmapResources(1, resource_, stream_); + } + + void CudaResource::GraphicsMapHolder::reset() + { + resource_ = 0; + } + + void CudaResource::copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream) + { + CV_DbgAssert( resource_ != 0 ); + + GraphicsMapHolder h(&resource_, stream); + (void) h; + + void* dst; + size_t size; + cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&dst, &size, resource_) ); + + CV_DbgAssert( width * height == size ); if (stream == 0) - cudaSafeCall( cudaMemcpy2D(dst_ptr, widthBytes, src_ptr, mat.step, widthBytes, mat.rows, cudaMemcpyDeviceToDevice) ); + cudaSafeCall( cudaMemcpy2D(dst, width, src, spitch, width, height, cudaMemcpyDeviceToDevice) ); else - cudaSafeCall( cudaMemcpy2DAsync(dst_ptr, widthBytes, src_ptr, mat.step, widthBytes, mat.rows, cudaMemcpyDeviceToDevice, stream) ); - - cudaGraphicsUnmapResources(1, &resource_, stream); + cudaSafeCall( cudaMemcpy2DAsync(dst, width, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream) ); } - GpuMat CudaGlInterop::map(int rows, int cols, int type, cudaStream_t stream) + void CudaResource::copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream) { - CV_Assert(resource_ != 0); + CV_DbgAssert( resource_ != 0 ); - cudaSafeCall( cudaGraphicsMapResources(1, &resource_, stream) ); + GraphicsMapHolder h(&resource_, stream); + (void) h; + + void* src; + size_t size; + cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&src, &size, resource_) ); + + CV_DbgAssert( width * height == size ); + + if (stream == 0) + cudaSafeCall( cudaMemcpy2D(dst, dpitch, src, width, width, height, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpy2DAsync(dst, dpitch, src, width, width, height, cudaMemcpyDeviceToDevice, stream) ); + } + + void* CudaResource::map(cudaStream_t stream) + { + CV_DbgAssert( resource_ != 0 ); + + GraphicsMapHolder h(&resource_, stream); void* ptr; - size_t num_bytes; - cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&ptr, &num_bytes, resource_) ); + size_t size; + cudaSafeCall( cudaGraphicsResourceGetMappedPointer(&ptr, &size, resource_) ); - CV_Assert( static_cast(cols) * CV_ELEM_SIZE(type) * rows <= num_bytes ); + h.reset(); - return GpuMat(rows, cols, type, ptr); + return ptr; } - inline void CudaGlInterop::unmap(cudaStream_t stream) + void 
CudaResource::unmap(cudaStream_t stream) { + CV_Assert( resource_ != 0 ); + cudaGraphicsUnmapResources(1, &resource_, stream); } } -#endif // HAVE_CUDA && HAVE_OPENGL + +#endif //////////////////////////////////////////////////////////////////////// // GlBuffer @@ -296,393 +326,466 @@ class cv::GlBuffer::Impl public: static const Ptr& empty(); - Impl(int rows, int cols, int type, unsigned int target); - Impl(const Mat& m, unsigned int target); + Impl(GLuint bufId, bool autoRelease); + Impl(GLsizeiptr size, const GLvoid* data, GLenum target, bool autoRelease); ~Impl(); - void copyFrom(const Mat& m, unsigned int target); + void bind(GLenum target) const; + + void copyFrom(GLuint srcBuf, GLsizeiptr size); + + void copyFrom(GLsizeiptr size, const GLvoid* data); + void copyTo(GLsizeiptr size, GLvoid* data) const; + + void* mapHost(GLenum access); + void unmapHost(); #ifdef HAVE_CUDA - void copyFrom(const GpuMat& mat, cudaStream_t stream = 0); -#endif + void copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream = 0); + void copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream = 0) const; - void bind(unsigned int target) const; - void unbind(unsigned int target) const; - - Mat mapHost(int rows, int cols, int type, unsigned int target); - void unmapHost(unsigned int target); - -#ifdef HAVE_CUDA - GpuMat mapDevice(int rows, int cols, int type, cudaStream_t stream = 0); + void* mapDevice(cudaStream_t stream = 0); void unmapDevice(cudaStream_t stream = 0); #endif + void setAutoRelease(bool flag) { autoRelease_ = flag; } + + GLuint bufId() const { return bufId_; } + private: Impl(); - unsigned int buffer_; + GLuint bufId_; + bool autoRelease_; #ifdef HAVE_CUDA - CudaGlInterop cudaGlInterop_; + mutable CudaResource cudaResource_; #endif }; -inline const Ptr& cv::GlBuffer::Impl::empty() +const Ptr& cv::GlBuffer::Impl::empty() { static Ptr p(new Impl); return p; } -inline cv::GlBuffer::Impl::Impl() : buffer_(0) +cv::GlBuffer::Impl::Impl() : bufId_(0), autoRelease_(true) { } -cv::GlBuffer::Impl::Impl(int rows, int cols, int type, unsigned int target) : buffer_(0) +cv::GlBuffer::Impl::Impl(GLuint abufId, bool autoRelease) : bufId_(abufId), autoRelease_(autoRelease) { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; - - CV_DbgAssert(rows > 0 && cols > 0); - CV_DbgAssert(CV_MAT_DEPTH(type) >= 0 && CV_MAT_DEPTH(type) <= CV_64F); - - glFuncTab()->genBuffers(1, &buffer_); - CV_CheckGlError(); - CV_Assert(buffer_ != 0); - - size_t size = rows * cols * CV_ELEM_SIZE(type); - - glFuncTab()->bindBuffer(target, buffer_); - CV_CheckGlError(); - - glFuncTab()->bufferData(target, size, 0, GL_DYNAMIC_DRAW); - CV_CheckGlError(); - - glFuncTab()->bindBuffer(target, 0); - -#ifdef HAVE_CUDA - if (g_isCudaGlDeviceInitialized) - cudaGlInterop_.registerBuffer(buffer_); -#endif } -cv::GlBuffer::Impl::Impl(const Mat& m, unsigned int target) : buffer_(0) +cv::GlBuffer::Impl::Impl(GLsizeiptr size, const GLvoid* data, GLenum target, bool autoRelease) : bufId_(0), autoRelease_(autoRelease) { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; - - CV_DbgAssert(m.rows > 0 && m.cols > 0); - CV_DbgAssert(m.depth() >= 0 && m.depth() <= CV_64F); - CV_Assert(m.isContinuous()); - - glFuncTab()->genBuffers(1, &buffer_); - CV_CheckGlError(); - CV_Assert(buffer_ != 0); - - size_t size = m.rows * m.cols * m.elemSize(); - - glFuncTab()->bindBuffer(target, buffer_); + gl::GenBuffers(1, &bufId_); CV_CheckGlError(); - glFuncTab()->bufferData(target, size, m.data, 
GL_DYNAMIC_DRAW); + CV_Assert( bufId_ != 0 ); + + gl::BindBuffer(target, bufId_); CV_CheckGlError(); - glFuncTab()->bindBuffer(target, 0); + gl::BufferData(target, size, data, gl::DYNAMIC_DRAW); + CV_CheckGlError(); -#ifdef HAVE_CUDA - if (g_isCudaGlDeviceInitialized) - cudaGlInterop_.registerBuffer(buffer_); -#endif + gl::BindBuffer(target, 0); + CV_CheckGlError(); } cv::GlBuffer::Impl::~Impl() { - try + if (autoRelease_ && bufId_) + gl::DeleteBuffers(1, &bufId_); +} + +void cv::GlBuffer::Impl::bind(GLenum target) const +{ + gl::BindBuffer(target, bufId_); + CV_CheckGlError(); +} + +void cv::GlBuffer::Impl::copyFrom(GLuint srcBuf, GLsizeiptr size) +{ + gl::BindBuffer(gl::COPY_WRITE_BUFFER, bufId_); + CV_CheckGlError(); + + gl::BindBuffer(gl::COPY_READ_BUFFER, srcBuf); + CV_CheckGlError(); + + gl::CopyBufferSubData(gl::COPY_READ_BUFFER, gl::COPY_WRITE_BUFFER, 0, 0, size); + CV_CheckGlError(); +} + +void cv::GlBuffer::Impl::copyFrom(GLsizeiptr size, const GLvoid* data) +{ + gl::BindBuffer(gl::COPY_WRITE_BUFFER, bufId_); + CV_CheckGlError(); + + gl::BufferSubData(gl::COPY_WRITE_BUFFER, 0, size, data); + CV_CheckGlError(); +} + +void cv::GlBuffer::Impl::copyTo(GLsizeiptr size, GLvoid* data) const +{ + gl::BindBuffer(gl::COPY_READ_BUFFER, bufId_); + CV_CheckGlError(); + + gl::GetBufferSubData(gl::COPY_READ_BUFFER, 0, size, data); + CV_CheckGlError(); +} + +void* cv::GlBuffer::Impl::mapHost(GLenum access) +{ + gl::BindBuffer(gl::COPY_READ_BUFFER, bufId_); + CV_CheckGlError(); + + GLvoid* data = gl::MapBuffer(gl::COPY_READ_BUFFER, access); + CV_CheckGlError(); + + return data; +} + +void cv::GlBuffer::Impl::unmapHost() +{ + gl::UnmapBuffer(gl::COPY_READ_BUFFER); +} + +#ifdef HAVE_CUDA + void cv::GlBuffer::Impl::copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream) { - if (buffer_) - glFuncTab()->deleteBuffers(1, &buffer_); + cudaResource_.registerBuffer(bufId_); + cudaResource_.copyFrom(src, spitch, width, height, stream); } -#ifdef _DEBUG - catch(const exception& e) + + void cv::GlBuffer::Impl::copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream) const { - cerr << e.what() << endl; + cudaResource_.registerBuffer(bufId_); + cudaResource_.copyTo(dst, dpitch, width, height, stream); + } + + void* cv::GlBuffer::Impl::mapDevice(cudaStream_t stream) + { + cudaResource_.registerBuffer(bufId_); + return cudaResource_.map(stream); + } + + void cv::GlBuffer::Impl::unmapDevice(cudaStream_t stream) + { + cudaResource_.unmap(stream); } #endif - catch(...) 
- { - } -} - -void cv::GlBuffer::Impl::copyFrom(const Mat& m, unsigned int target) -{ - CV_Assert(buffer_ != 0); - - CV_Assert(m.isContinuous()); - - bind(target); - - size_t size = m.rows * m.cols * m.elemSize(); - - glFuncTab()->bufferSubData(target, 0, size, m.data); - CV_CheckGlError(); - - unbind(target); -} - -#ifdef HAVE_CUDA - -void cv::GlBuffer::Impl::copyFrom(const GpuMat& mat, cudaStream_t stream) -{ - if (!g_isCudaGlDeviceInitialized) - cvError(CV_GpuApiCallError, "copyFrom", "cuda GL device wasn't initialized, call setGlDevice", __FILE__, __LINE__); - - CV_Assert(buffer_ != 0); - - cudaGlInterop_.copyFrom(mat, stream); -} - -#endif // HAVE_CUDA - -inline void cv::GlBuffer::Impl::bind(unsigned int target) const -{ - CV_Assert(buffer_ != 0); - - glFuncTab()->bindBuffer(target, buffer_); - CV_CheckGlError(); -} - -inline void cv::GlBuffer::Impl::unbind(unsigned int target) const -{ - glFuncTab()->bindBuffer(target, 0); -} - -inline Mat cv::GlBuffer::Impl::mapHost(int rows, int cols, int type, unsigned int target) -{ - void* ptr = glFuncTab()->mapBuffer(target, GL_READ_WRITE); - CV_CheckGlError(); - - return Mat(rows, cols, type, ptr); -} - -inline void cv::GlBuffer::Impl::unmapHost(unsigned int target) -{ - glFuncTab()->unmapBuffer(target); -} - -#ifdef HAVE_CUDA - -inline GpuMat cv::GlBuffer::Impl::mapDevice(int rows, int cols, int type, cudaStream_t stream) -{ - if (!g_isCudaGlDeviceInitialized) - cvError(CV_GpuApiCallError, "copyFrom", "cuda GL device wasn't initialized, call setGlDevice", __FILE__, __LINE__); - - CV_Assert(buffer_ != 0); - - return cudaGlInterop_.map(rows, cols, type, stream); -} - -inline void cv::GlBuffer::Impl::unmapDevice(cudaStream_t stream) -{ - if (!g_isCudaGlDeviceInitialized) - cvError(CV_GpuApiCallError, "copyFrom", "cuda GL device wasn't initialized, call setGlDevice", __FILE__, __LINE__); - - cudaGlInterop_.unmap(stream); -} - -#endif // HAVE_CUDA #endif // HAVE_OPENGL -cv::GlBuffer::GlBuffer(Usage _usage) : rows_(0), cols_(0), type_(0), usage_(_usage) +cv::GlBuffer::GlBuffer() : rows_(0), cols_(0), type_(0) { #ifndef HAVE_OPENGL - (void)_usage; - throw_nogl; + throw_nogl(); #else impl_ = Impl::empty(); #endif } -cv::GlBuffer::GlBuffer(int _rows, int _cols, int _type, Usage _usage) : rows_(0), cols_(0), type_(0), usage_(_usage) +cv::GlBuffer::GlBuffer(int arows, int acols, int atype, unsigned int abufId, bool autoRelease) : rows_(0), cols_(0), type_(0) { #ifndef HAVE_OPENGL - (void)_rows; - (void)_cols; - (void)_type; - (void)_usage; - throw_nogl; + (void) arows; + (void) acols; + (void) atype; + (void) abufId; + (void) autoRelease; + throw_nogl(); #else - impl_ = new Impl(_rows, _cols, _type, _usage); - rows_ = _rows; - cols_ = _cols; - type_ = _type; + impl_ = new Impl(abufId, autoRelease); + rows_ = arows; + cols_ = acols; + type_ = atype; #endif } -cv::GlBuffer::GlBuffer(Size _size, int _type, Usage _usage) : rows_(0), cols_(0), type_(0), usage_(_usage) +cv::GlBuffer::GlBuffer(Size asize, int atype, unsigned int abufId, bool autoRelease) : rows_(0), cols_(0), type_(0) { #ifndef HAVE_OPENGL - (void)_size; - (void)_type; - (void)_usage; - throw_nogl; + (void) asize; + (void) atype; + (void) abufId; + (void) autoRelease; + throw_nogl(); #else - impl_ = new Impl(_size.height, _size.width, _type, _usage); - rows_ = _size.height; - cols_ = _size.width; - type_ = _type; + impl_ = new Impl(abufId, autoRelease); + rows_ = asize.height; + cols_ = asize.width; + type_ = atype; #endif } -cv::GlBuffer::GlBuffer(InputArray mat_, Usage _usage) : rows_(0), 
cols_(0), type_(0), usage_(_usage) +cv::GlBuffer::GlBuffer(int arows, int acols, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0) +{ + create(arows, acols, atype, target, autoRelease); +} + +cv::GlBuffer::GlBuffer(Size asize, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0) +{ + create(asize, atype, target, autoRelease); +} + +cv::GlBuffer::GlBuffer(InputArray arr, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0) { #ifndef HAVE_OPENGL - (void)mat_; - (void)_usage; - throw_nogl; + (void) arr; + (void) target; + (void) autoRelease; + throw_nogl(); #else - int kind = mat_.kind(); - Size _size = mat_.size(); - int _type = mat_.type(); + const int kind = arr.kind(); - if (kind == _InputArray::GPU_MAT) + switch (kind) { - #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; - #else - GpuMat d_mat = mat_.getGpuMat(); - impl_ = new Impl(d_mat.rows, d_mat.cols, d_mat.type(), _usage); - impl_->copyFrom(d_mat); - #endif + case _InputArray::OPENGL_BUFFER: + { + copyFrom(arr, target, autoRelease); + break; + } + + case _InputArray::OPENGL_TEXTURE2D: + { + copyFrom(arr, target, autoRelease); + break; + } + + case _InputArray::GPU_MAT: + { + copyFrom(arr, target, autoRelease); + break; + } + + default: + { + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + const GLsizeiptr asize = mat.rows * mat.cols * mat.elemSize(); + impl_ = new Impl(asize, mat.data, target, autoRelease); + rows_ = mat.rows; + cols_ = mat.cols; + type_ = mat.type(); + break; + } } - else - { - Mat mat = mat_.getMat(); - impl_ = new Impl(mat, _usage); - } - - rows_ = _size.height; - cols_ = _size.width; - type_ = _type; #endif } -void cv::GlBuffer::create(int _rows, int _cols, int _type, Usage _usage) +void cv::GlBuffer::create(int arows, int acols, int atype, Target target, bool autoRelease) { #ifndef HAVE_OPENGL - (void)_rows; - (void)_cols; - (void)_type; - (void)_usage; - throw_nogl; + (void) arows; + (void) acols; + (void) atype; + (void) target; + (void) autoRelease; + throw_nogl(); #else - if (rows_ != _rows || cols_ != _cols || type_ != _type || usage_ != _usage) + if (rows_ != arows || cols_ != acols || type_ != atype) { - impl_ = new Impl(_rows, _cols, _type, _usage); - rows_ = _rows; - cols_ = _cols; - type_ = _type; - usage_ = _usage; + const GLsizeiptr asize = arows * acols * CV_ELEM_SIZE(atype); + impl_ = new Impl(asize, 0, target, autoRelease); + rows_ = arows; + cols_ = acols; + type_ = atype; } #endif } void cv::GlBuffer::release() { -#ifndef HAVE_OPENGL - throw_nogl; -#else +#ifdef HAVE_OPENGL + if (*impl_.refcount == 1) + impl_->setAutoRelease(true); impl_ = Impl::empty(); + rows_ = 0; + cols_ = 0; + type_ = 0; #endif } -void cv::GlBuffer::copyFrom(InputArray mat_) +void cv::GlBuffer::setAutoRelease(bool flag) { #ifndef HAVE_OPENGL - (void)mat_; - throw_nogl; + (void) flag; + throw_nogl(); #else - int kind = mat_.kind(); - Size _size = mat_.size(); - int _type = mat_.type(); + impl_->setAutoRelease(flag); +#endif +} - create(_size, _type); +void cv::GlBuffer::copyFrom(InputArray arr, Target target, bool autoRelease) +{ +#ifndef HAVE_OPENGL + (void) arr; + (void) target; + (void) autoRelease; + throw_nogl(); +#else + const int kind = arr.kind(); + + if (kind == _InputArray::OPENGL_TEXTURE2D) + { + GlTexture2D tex = arr.getGlTexture2D(); + tex.copyTo(*this); + setAutoRelease(autoRelease); + return; + } + + const Size asize = arr.size(); + const int atype = arr.type(); + create(asize, atype, target, autoRelease); switch (kind) { 
case _InputArray::OPENGL_BUFFER: { - GlBuffer buf = mat_.getGlBuffer(); - *this = buf; + GlBuffer buf = arr.getGlBuffer(); + impl_->copyFrom(buf.bufId(), asize.area() * CV_ELEM_SIZE(atype)); break; } + case _InputArray::GPU_MAT: { #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); #else - GpuMat d_mat = mat_.getGpuMat(); - impl_->copyFrom(d_mat); + GpuMat dmat = arr.getGpuMat(); + impl_->copyFrom(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows); #endif break; } + default: { - Mat mat = mat_.getMat(); - impl_->copyFrom(mat, usage_); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + impl_->copyFrom(asize.area() * CV_ELEM_SIZE(atype), mat.data); } } #endif } -void cv::GlBuffer::bind() const +void cv::GlBuffer::copyTo(OutputArray arr, Target target, bool autoRelease) const { #ifndef HAVE_OPENGL - throw_nogl; + (void) arr; + (void) target; + (void) autoRelease; + throw_nogl(); #else - impl_->bind(usage_); + const int kind = arr.kind(); + + switch (kind) + { + case _InputArray::OPENGL_BUFFER: + { + arr.getGlBufferRef().copyFrom(*this, target, autoRelease); + break; + } + + case _InputArray::OPENGL_TEXTURE2D: + { + arr.getGlTexture2DRef().copyFrom(*this, autoRelease); + break; + } + + case _InputArray::GPU_MAT: + { + #if !defined HAVE_CUDA || defined(CUDA_DISABLER) + throw_nocuda(); + #else + GpuMat& dmat = arr.getGpuMatRef(); + dmat.create(rows_, cols_, type_); + impl_->copyTo(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows); + #endif + + break; + } + + default: + { + arr.create(rows_, cols_, type_); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + impl_->copyTo(mat.rows * mat.cols * mat.elemSize(), mat.data); + } + } #endif } -void cv::GlBuffer::unbind() const +GlBuffer cv::GlBuffer::clone(Target target, bool autoRelease) const { #ifndef HAVE_OPENGL - throw_nogl; + (void) target; + (void) autoRelease; + throw_nogl(); + return GlBuffer(); #else - impl_->unbind(usage_); + GlBuffer buf; + buf.copyFrom(*this, target, autoRelease); + return buf; #endif } -Mat cv::GlBuffer::mapHost() +void cv::GlBuffer::bind(Target target) const { #ifndef HAVE_OPENGL - throw_nogl; + (void) target; + throw_nogl(); +#else + impl_->bind(target); +#endif +} + +void cv::GlBuffer::unbind(Target target) +{ +#ifndef HAVE_OPENGL + (void) target; + throw_nogl(); +#else + gl::BindBuffer(target, 0); + CV_CheckGlError(); +#endif +} + +Mat cv::GlBuffer::mapHost(Access access) +{ +#ifndef HAVE_OPENGL + (void) access; + throw_nogl(); return Mat(); #else - return impl_->mapHost(rows_, cols_, type_, usage_); + return Mat(rows_, cols_, type_, impl_->mapHost(access)); #endif } void cv::GlBuffer::unmapHost() { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); #else - impl_->unmapHost(usage_); + return impl_->unmapHost(); #endif } GpuMat cv::GlBuffer::mapDevice() { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); return GpuMat(); #else #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); return GpuMat(); #else - return impl_->mapDevice(rows_, cols_, type_); + return GpuMat(rows_, cols_, type_, impl_->mapDevice()); #endif #endif } @@ -690,418 +793,443 @@ GpuMat cv::GlBuffer::mapDevice() void cv::GlBuffer::unmapDevice() { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); #else #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); #else impl_->unmapDevice(); #endif #endif } +unsigned int cv::GlBuffer::bufId() const +{ +#ifndef HAVE_OPENGL + throw_nogl(); + return 0; +#else + return impl_->bufId(); 
+#endif +} + template <> void cv::Ptr::delete_obj() { if (obj) delete obj; } ////////////////////////////////////////////////////////////////////////////////////////// -// GlTexture +// GlTexture2D #ifndef HAVE_OPENGL -class cv::GlTexture::Impl +class cv::GlTexture2D::Impl { }; #else -class cv::GlTexture::Impl +class cv::GlTexture2D::Impl { public: static const Ptr empty(); - Impl(int rows, int cols, int type); - - Impl(const Mat& mat, bool bgra); - Impl(const GlBuffer& buf, bool bgra); - + Impl(GLuint texId, bool autoRelease); + Impl(GLint internalFormat, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid* pixels, bool autoRelease); ~Impl(); - void copyFrom(const Mat& mat, bool bgra); - void copyFrom(const GlBuffer& buf, bool bgra); + void copyFrom(GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels); + void copyTo(GLenum format, GLenum type, GLvoid* pixels) const; void bind() const; - void unbind() const; + + void setAutoRelease(bool flag) { autoRelease_ = flag; } + + GLuint texId() const { return texId_; } private: Impl(); - GLuint tex_; + GLuint texId_; + bool autoRelease_; }; -inline const Ptr cv::GlTexture::Impl::empty() +const Ptr cv::GlTexture2D::Impl::empty() { static Ptr p(new Impl); return p; } -inline cv::GlTexture::Impl::Impl() : tex_(0) +cv::GlTexture2D::Impl::Impl() : texId_(0), autoRelease_(true) { } -cv::GlTexture::Impl::Impl(int rows, int cols, int type) : tex_(0) +cv::GlTexture2D::Impl::Impl(GLuint atexId, bool autoRelease) : texId_(atexId), autoRelease_(autoRelease) { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; +} - int depth = CV_MAT_DEPTH(type); - int cn = CV_MAT_CN(type); - - CV_DbgAssert(rows > 0 && cols > 0); - CV_Assert(cn == 1 || cn == 3 || cn == 4); - CV_Assert(depth >= 0 && depth <= CV_32F); - - glGenTextures(1, &tex_); - CV_CheckGlError(); - CV_Assert(tex_ != 0); - - glBindTexture(GL_TEXTURE_2D, tex_); +cv::GlTexture2D::Impl::Impl(GLint internalFormat, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid* pixels, bool autoRelease) : texId_(0), autoRelease_(autoRelease) +{ + gl::GenTextures(1, &texId_); CV_CheckGlError(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + CV_Assert(texId_ != 0); + + gl::BindTexture(gl::TEXTURE_2D, texId_); CV_CheckGlError(); - GLenum format = cn == 1 ? GL_LUMINANCE : cn == 3 ? 
GL_BGR : GL_BGRA; - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + gl::PixelStorei(gl::UNPACK_ALIGNMENT, 1); CV_CheckGlError(); - glTexImage2D(GL_TEXTURE_2D, 0, cn, cols, rows, 0, format, gl_types[depth], 0); + gl::TexImage2D(gl::TEXTURE_2D, 0, internalFormat, width, height, 0, format, type, pixels); + CV_CheckGlError(); + + gl::GenerateMipmap(gl::TEXTURE_2D); CV_CheckGlError(); } -cv::GlTexture::Impl::Impl(const Mat& mat, bool bgra) : tex_(0) +cv::GlTexture2D::Impl::~Impl() { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; + if (autoRelease_ && texId_) + gl::DeleteTextures(1, &texId_); +} - int depth = mat.depth(); - int cn = mat.channels(); - - CV_DbgAssert(mat.rows > 0 && mat.cols > 0); - CV_Assert(cn == 1 || cn == 3 || cn == 4); - CV_Assert(depth >= 0 && depth <= CV_32F); - CV_Assert(mat.isContinuous()); - - glGenTextures(1, &tex_); - CV_CheckGlError(); - CV_Assert(tex_ != 0); - - glBindTexture(GL_TEXTURE_2D, tex_); +void cv::GlTexture2D::Impl::copyFrom(GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels) +{ + gl::BindTexture(gl::TEXTURE_2D, texId_); CV_CheckGlError(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + gl::PixelStorei(gl::UNPACK_ALIGNMENT, 1); CV_CheckGlError(); - GLenum format = cn == 1 ? GL_LUMINANCE : (cn == 3 ? (bgra ? GL_BGR : GL_RGB) : (bgra ? GL_BGRA : GL_RGBA)); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + gl::TexSubImage2D(gl::TEXTURE_2D, 0, 0, 0, width, height, format, type, pixels); CV_CheckGlError(); - glTexImage2D(GL_TEXTURE_2D, 0, cn, mat.cols, mat.rows, 0, format, gl_types[depth], mat.data); + gl::GenerateMipmap(gl::TEXTURE_2D); CV_CheckGlError(); } -cv::GlTexture::Impl::Impl(const GlBuffer& buf, bool bgra) : tex_(0) +void cv::GlTexture2D::Impl::copyTo(GLenum format, GLenum type, GLvoid* pixels) const { - if (!glFuncTab()->isGlContextInitialized()) - throw_nogl; - - int depth = buf.depth(); - int cn = buf.channels(); - - CV_DbgAssert(buf.rows() > 0 && buf.cols() > 0); - CV_Assert(cn == 1 || cn == 3 || cn == 4); - CV_Assert(depth >= 0 && depth <= CV_32F); - CV_Assert(buf.usage() == GlBuffer::TEXTURE_BUFFER); - - glGenTextures(1, &tex_); - CV_CheckGlError(); - CV_Assert(tex_ != 0); - - glBindTexture(GL_TEXTURE_2D, tex_); + gl::BindTexture(gl::TEXTURE_2D, texId_); CV_CheckGlError(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + gl::PixelStorei(gl::PACK_ALIGNMENT, 1); CV_CheckGlError(); - GLenum format = cn == 1 ? GL_LUMINANCE : (cn == 3 ? (bgra ? GL_BGR : GL_RGB) : (bgra ? GL_BGRA : GL_RGBA)); - - buf.bind(); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - CV_CheckGlError(); - - glTexImage2D(GL_TEXTURE_2D, 0, cn, buf.cols(), buf.rows(), 0, format, gl_types[depth], 0); - CV_CheckGlError(); - - buf.unbind(); -} - -inline cv::GlTexture::Impl::~Impl() -{ - if (tex_) - glDeleteTextures(1, &tex_); -} - -void cv::GlTexture::Impl::copyFrom(const Mat& mat, bool bgra) -{ - CV_Assert(tex_ != 0); - - bind(); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - CV_CheckGlError(); - - int cn = mat.channels(); - GLenum format = cn == 1 ? GL_LUMINANCE : (cn == 3 ? (bgra ? GL_BGR : GL_RGB) : (bgra ? 
GL_BGRA : GL_RGBA)); - - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, mat.cols, mat.rows, format, gl_types[mat.depth()], mat.data); - CV_CheckGlError(); - - unbind(); -} - -void cv::GlTexture::Impl::copyFrom(const GlBuffer& buf, bool bgra) -{ - CV_Assert(tex_ != 0); - CV_Assert(buf.usage() == GlBuffer::TEXTURE_BUFFER); - - bind(); - - buf.bind(); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - CV_CheckGlError(); - - int cn = buf.channels(); - GLenum format = cn == 1 ? GL_LUMINANCE : (cn == 3 ? (bgra ? GL_BGR : GL_RGB) : (bgra ? GL_BGRA : GL_RGBA)); - - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, buf.cols(), buf.rows(), format, gl_types[buf.depth()], 0); - CV_CheckGlError(); - - buf.unbind(); - - unbind(); -} - -inline void cv::GlTexture::Impl::bind() const -{ - CV_Assert(tex_ != 0); - - glEnable(GL_TEXTURE_2D); - CV_CheckGlError(); - - glBindTexture(GL_TEXTURE_2D, tex_); + gl::GetTexImage(gl::TEXTURE_2D, 0, format, type, pixels); CV_CheckGlError(); } -inline void cv::GlTexture::Impl::unbind() const +void cv::GlTexture2D::Impl::bind() const { - glBindTexture(GL_TEXTURE_2D, 0); - - glDisable(GL_TEXTURE_2D); + gl::BindTexture(gl::TEXTURE_2D, texId_); + CV_CheckGlError(); } #endif // HAVE_OPENGL -cv::GlTexture::GlTexture() : rows_(0), cols_(0), type_(0), buf_(GlBuffer::TEXTURE_BUFFER) +cv::GlTexture2D::GlTexture2D() : rows_(0), cols_(0), format_(NONE) { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); #else impl_ = Impl::empty(); #endif } -cv::GlTexture::GlTexture(int _rows, int _cols, int _type) : rows_(0), cols_(0), type_(0), buf_(GlBuffer::TEXTURE_BUFFER) +cv::GlTexture2D::GlTexture2D(int arows, int acols, Format aformat, unsigned int atexId, bool autoRelease) : rows_(0), cols_(0), format_(NONE) { #ifndef HAVE_OPENGL - (void)_rows; - (void)_cols; - (void)_type; - throw_nogl; + (void) arows; + (void) acols; + (void) aformat; + (void) atexId; + (void) autoRelease; + throw_nogl(); #else - impl_ = new Impl(_rows, _cols, _type); - rows_ = _rows; - cols_ = _cols; - type_ = _type; + impl_ = new Impl(atexId, autoRelease); + rows_ = arows; + cols_ = acols; + format_ = aformat; #endif } -cv::GlTexture::GlTexture(Size _size, int _type) : rows_(0), cols_(0), type_(0), buf_(GlBuffer::TEXTURE_BUFFER) +cv::GlTexture2D::GlTexture2D(Size asize, Format aformat, unsigned int atexId, bool autoRelease) : rows_(0), cols_(0), format_(NONE) { #ifndef HAVE_OPENGL - (void)_size; - (void)_type; - throw_nogl; + (void) asize; + (void) aformat; + (void) atexId; + (void) autoRelease; + throw_nogl(); #else - impl_ = new Impl(_size.height, _size.width, _type); - rows_ = _size.height; - cols_ = _size.width; - type_ = _type; + impl_ = new Impl(atexId, autoRelease); + rows_ = asize.height; + cols_ = asize.width; + format_ = aformat; #endif } -cv::GlTexture::GlTexture(InputArray mat_, bool bgra) : rows_(0), cols_(0), type_(0), buf_(GlBuffer::TEXTURE_BUFFER) +cv::GlTexture2D::GlTexture2D(int arows, int acols, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE) +{ + create(arows, acols, aformat, autoRelease); +} + +cv::GlTexture2D::GlTexture2D(Size asize, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE) +{ + create(asize, aformat, autoRelease); +} + +cv::GlTexture2D::GlTexture2D(InputArray arr, bool autoRelease) : rows_(0), cols_(0), format_(NONE) { #ifndef HAVE_OPENGL - (void)mat_; - (void)bgra; - throw_nogl; + (void) arr; + (void) autoRelease; + throw_nogl(); #else - int kind = mat_.kind(); - Size _size = mat_.size(); - int _type = mat_.type(); + const int kind = arr.kind(); + + const Size asize = 
arr.size(); + const int atype = arr.type(); + + const int depth = CV_MAT_DEPTH(atype); + const int cn = CV_MAT_CN(atype); + + CV_Assert( depth <= CV_32F ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); + + const Format internalFormats[] = + { + NONE, DEPTH_COMPONENT, NONE, RGB, RGBA + }; + const GLenum srcFormats[] = + { + 0, gl::DEPTH_COMPONENT, 0, gl::BGR, gl::BGRA + }; switch (kind) { case _InputArray::OPENGL_BUFFER: { - GlBuffer buf = mat_.getGlBuffer(); - impl_ = new Impl(buf, bgra); + GlBuffer buf = arr.getGlBuffer(); + buf.bind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_ = new Impl(internalFormats[cn], asize.width, asize.height, srcFormats[cn], gl_types[depth], 0, autoRelease); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); break; } + case _InputArray::GPU_MAT: { #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); #else - GpuMat d_mat = mat_.getGpuMat(); - GlBuffer buf(d_mat, GlBuffer::TEXTURE_BUFFER); - impl_ = new Impl(buf, bgra); + GpuMat dmat = arr.getGpuMat(); + GlBuffer buf(dmat, GlBuffer::PIXEL_UNPACK_BUFFER); + buf.bind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_ = new Impl(internalFormats[cn], asize.width, asize.height, srcFormats[cn], gl_types[depth], 0, autoRelease); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); #endif break; } + default: { - Mat mat = mat_.getMat(); - impl_ = new Impl(mat, bgra); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_ = new Impl(internalFormats[cn], asize.width, asize.height, srcFormats[cn], gl_types[depth], mat.data, autoRelease); break; } } - rows_ = _size.height; - cols_ = _size.width; - type_ = _type; + rows_ = asize.height; + cols_ = asize.width; + format_ = internalFormats[cn]; #endif } -void cv::GlTexture::create(int _rows, int _cols, int _type) +void cv::GlTexture2D::create(int arows, int acols, Format aformat, bool autoRelease) { #ifndef HAVE_OPENGL - (void)_rows; - (void)_cols; - (void)_type; - throw_nogl; + (void) arows; + (void) acols; + (void) aformat; + (void) autoRelease; + throw_nogl(); #else - if (rows_ != _rows || cols_ != _cols || type_ != _type) + if (rows_ != arows || cols_ != acols || format_ != aformat) { - impl_ = new Impl(_rows, _cols, _type); - rows_ = _rows; - cols_ = _cols; - type_ = _type; + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_ = new Impl(aformat, acols, arows, aformat, gl::FLOAT, 0, autoRelease); + rows_ = arows; + cols_ = acols; + format_ = aformat; } #endif } -void cv::GlTexture::release() +void cv::GlTexture2D::release() { -#ifndef HAVE_OPENGL - throw_nogl; -#else +#ifdef HAVE_OPENGL + if (*impl_.refcount == 1) + impl_->setAutoRelease(true); impl_ = Impl::empty(); + rows_ = 0; + cols_ = 0; + format_ = NONE; #endif } -void cv::GlTexture::copyFrom(InputArray mat_, bool bgra) +void cv::GlTexture2D::setAutoRelease(bool flag) { #ifndef HAVE_OPENGL - (void)mat_; - (void)bgra; - throw_nogl; + (void) flag; + throw_nogl(); #else - int kind = mat_.kind(); - Size _size = mat_.size(); - int _type = mat_.type(); + impl_->setAutoRelease(flag); +#endif +} - create(_size, _type); +void cv::GlTexture2D::copyFrom(InputArray arr, bool autoRelease) +{ +#ifndef HAVE_OPENGL + (void) arr; + (void) autoRelease; + throw_nogl(); +#else + const int kind = arr.kind(); + + const Size asize = arr.size(); + const int atype = arr.type(); + + const int depth = CV_MAT_DEPTH(atype); + const int cn = CV_MAT_CN(atype); + + CV_Assert( depth <= CV_32F ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); + + const Format 
internalFormats[] = + { + NONE, DEPTH_COMPONENT, NONE, RGB, RGBA + }; + const GLenum srcFormats[] = + { + 0, gl::DEPTH_COMPONENT, 0, gl::BGR, gl::BGRA + }; + + create(asize, internalFormats[cn], autoRelease); switch(kind) { - case _InputArray::OPENGL_TEXTURE: - { - GlTexture tex = mat_.getGlTexture(); - *this = tex; - break; - } case _InputArray::OPENGL_BUFFER: { - GlBuffer buf = mat_.getGlBuffer(); - impl_->copyFrom(buf, bgra); + GlBuffer buf = arr.getGlBuffer(); + buf.bind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_->copyFrom(asize.width, asize.height, srcFormats[cn], gl_types[depth], 0); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); break; } + case _InputArray::GPU_MAT: { #if !defined HAVE_CUDA || defined(CUDA_DISABLER) - throw_nocuda; + throw_nocuda(); #else - GpuMat d_mat = mat_.getGpuMat(); - buf_.copyFrom(d_mat); - impl_->copyFrom(buf_, bgra); + GpuMat dmat = arr.getGpuMat(); + GlBuffer buf(dmat, GlBuffer::PIXEL_UNPACK_BUFFER); + buf.bind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_->copyFrom(asize.width, asize.height, srcFormats[cn], gl_types[depth], 0); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); #endif break; } + default: { - Mat mat = mat_.getMat(); - impl_->copyFrom(mat, bgra); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + GlBuffer::unbind(GlBuffer::PIXEL_UNPACK_BUFFER); + impl_->copyFrom(asize.width, asize.height, srcFormats[cn], gl_types[depth], mat.data); } } #endif } -void cv::GlTexture::bind() const +void cv::GlTexture2D::copyTo(OutputArray arr, int ddepth, bool autoRelease) const { #ifndef HAVE_OPENGL - throw_nogl; + (void) arr; + (void) ddepth; + (void) autoRelease; + throw_nogl(); +#else + const int kind = arr.kind(); + + const int cn = format_ == DEPTH_COMPONENT ? 1: format_ == RGB ? 3 : 4; + const GLenum dstFormat = format_ == DEPTH_COMPONENT ? gl::DEPTH_COMPONENT : format_ == RGB ? 
gl::BGR : gl::BGRA; + + switch(kind) + { + case _InputArray::OPENGL_BUFFER: + { + GlBuffer& buf = arr.getGlBufferRef(); + buf.create(rows_, cols_, CV_MAKE_TYPE(ddepth, cn), GlBuffer::PIXEL_PACK_BUFFER, autoRelease); + buf.bind(GlBuffer::PIXEL_PACK_BUFFER); + impl_->copyTo(dstFormat, gl_types[ddepth], 0); + GlBuffer::unbind(GlBuffer::PIXEL_PACK_BUFFER); + break; + } + + case _InputArray::GPU_MAT: + { + #if !defined HAVE_CUDA || defined(CUDA_DISABLER) + throw_nocuda(); + #else + GlBuffer buf(rows_, cols_, CV_MAKE_TYPE(ddepth, cn), GlBuffer::PIXEL_PACK_BUFFER); + buf.bind(GlBuffer::PIXEL_PACK_BUFFER); + impl_->copyTo(dstFormat, gl_types[ddepth], 0); + GlBuffer::unbind(GlBuffer::PIXEL_PACK_BUFFER); + buf.copyTo(arr); + #endif + + break; + } + + default: + { + arr.create(rows_, cols_, CV_MAKE_TYPE(ddepth, cn)); + Mat mat = arr.getMat(); + CV_Assert( mat.isContinuous() ); + GlBuffer::unbind(GlBuffer::PIXEL_PACK_BUFFER); + impl_->copyTo(dstFormat, gl_types[ddepth], mat.data); + } + } +#endif +} + +void cv::GlTexture2D::bind() const +{ +#ifndef HAVE_OPENGL + throw_nogl(); #else impl_->bind(); #endif } -void cv::GlTexture::unbind() const +unsigned int cv::GlTexture2D::texId() const { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); + return 0; #else - impl_->unbind(); + return impl_->texId(); #endif } -template <> void cv::Ptr::delete_obj() +template <> void cv::Ptr::delete_obj() { if (obj) delete obj; } @@ -1109,266 +1237,253 @@ template <> void cv::Ptr::delete_obj() //////////////////////////////////////////////////////////////////////// // GlArrays -void cv::GlArrays::setVertexArray(InputArray vertex) +cv::GlArrays::GlArrays() : size_(0) { - int cn = vertex.channels(); - int depth = vertex.depth(); - - CV_Assert(cn == 2 || cn == 3 || cn == 4); - CV_Assert(depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F); - - vertex_.copyFrom(vertex); } -void cv::GlArrays::setColorArray(InputArray color, bool bgra) +void cv::GlArrays::setVertexArray(InputArray vertex) { - int cn = color.channels(); + const int cn = vertex.channels(); + const int depth = vertex.depth(); - CV_Assert((cn == 3 && !bgra) || cn == 4); + CV_Assert( cn == 2 || cn == 3 || cn == 4 ); + CV_Assert( depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F ); - color_.copyFrom(color); - bgra_ = bgra; + if (vertex.kind() == _InputArray::OPENGL_BUFFER) + vertex_ = vertex.getGlBuffer(); + else + vertex_.copyFrom(vertex); + + size_ = vertex_.size().area(); +} + +void cv::GlArrays::resetVertexArray() +{ + vertex_.release(); + size_ = 0; +} + +void cv::GlArrays::setColorArray(InputArray color) +{ + const int cn = color.channels(); + + CV_Assert( cn == 3 || cn == 4 ); + + if (color.kind() == _InputArray::OPENGL_BUFFER) + color_ = color.getGlBuffer(); + else + color_.copyFrom(color); +} + +void cv::GlArrays::resetColorArray() +{ + color_.release(); } void cv::GlArrays::setNormalArray(InputArray normal) { - int cn = normal.channels(); - int depth = normal.depth(); + const int cn = normal.channels(); + const int depth = normal.depth(); - CV_Assert(cn == 3); - CV_Assert(depth == CV_8S || depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F); + CV_Assert( cn == 3 ); + CV_Assert( depth == CV_8S || depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F ); - normal_.copyFrom(normal); + if (normal.kind() == _InputArray::OPENGL_BUFFER) + normal_ = normal.getGlBuffer(); + else + normal_.copyFrom(normal); +} + +void cv::GlArrays::resetNormalArray() +{ + normal_.release(); } void 
cv::GlArrays::setTexCoordArray(InputArray texCoord) { - int cn = texCoord.channels(); - int depth = texCoord.depth(); + const int cn = texCoord.channels(); + const int depth = texCoord.depth(); - CV_Assert(cn >= 1 && cn <= 4); - CV_Assert(depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F); + CV_Assert( cn >= 1 && cn <= 4 ); + CV_Assert( depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F ); - texCoord_.copyFrom(texCoord); + if (texCoord.kind() == _InputArray::OPENGL_BUFFER) + texCoord_ = texCoord.getGlBuffer(); + else + texCoord_.copyFrom(texCoord); +} + +void cv::GlArrays::resetTexCoordArray() +{ + texCoord_.release(); +} + +void cv::GlArrays::release() +{ + resetVertexArray(); + resetColorArray(); + resetNormalArray(); + resetTexCoordArray(); +} + +void cv::GlArrays::setAutoRelease(bool flag) +{ + vertex_.setAutoRelease(flag); + color_.setAutoRelease(flag); + normal_.setAutoRelease(flag); + texCoord_.setAutoRelease(flag); } void cv::GlArrays::bind() const { #ifndef HAVE_OPENGL - throw_nogl; + throw_nogl(); #else - CV_DbgAssert(texCoord_.empty() || texCoord_.size().area() == vertex_.size().area()); - CV_DbgAssert(normal_.empty() || normal_.size().area() == vertex_.size().area()); - CV_DbgAssert(color_.empty() || color_.size().area() == vertex_.size().area()); + CV_Assert( texCoord_.empty() || texCoord_.size().area() == size_ ); + CV_Assert( normal_.empty() || normal_.size().area() == size_ ); + CV_Assert( color_.empty() || color_.size().area() == size_ ); - if (!texCoord_.empty()) + if (texCoord_.empty()) { - glEnableClientState(GL_TEXTURE_COORD_ARRAY); + gl::DisableClientState(gl::TEXTURE_COORD_ARRAY); CV_CheckGlError(); - - texCoord_.bind(); - - glTexCoordPointer(texCoord_.channels(), gl_types[texCoord_.depth()], 0, 0); - CV_CheckGlError(); - - texCoord_.unbind(); } - - if (!normal_.empty()) + else { - glEnableClientState(GL_NORMAL_ARRAY); + gl::EnableClientState(gl::TEXTURE_COORD_ARRAY); CV_CheckGlError(); - normal_.bind(); + texCoord_.bind(GlBuffer::ARRAY_BUFFER); - glNormalPointer(gl_types[normal_.depth()], 0, 0); - CV_CheckGlError(); - - normal_.unbind(); - } - - if (!color_.empty()) - { - glEnableClientState(GL_COLOR_ARRAY); - CV_CheckGlError(); - - color_.bind(); - - int cn = color_.channels(); - int format = cn == 3 ? cn : (bgra_ ? 
GL_BGRA : 4); - - glColorPointer(format, gl_types[color_.depth()], 0, 0); - CV_CheckGlError(); - - color_.unbind(); - } - - if (!vertex_.empty()) - { - glEnableClientState(GL_VERTEX_ARRAY); - CV_CheckGlError(); - - vertex_.bind(); - - glVertexPointer(vertex_.channels(), gl_types[vertex_.depth()], 0, 0); - CV_CheckGlError(); - - vertex_.unbind(); - } -#endif -} - -void cv::GlArrays::unbind() const -{ -#ifndef HAVE_OPENGL - throw_nogl; -#else - if (!texCoord_.empty()) - { - glDisableClientState(GL_TEXTURE_COORD_ARRAY); + gl::TexCoordPointer(texCoord_.channels(), gl_types[texCoord_.depth()], 0, 0); CV_CheckGlError(); } - if (!normal_.empty()) + if (normal_.empty()) { - glDisableClientState(GL_NORMAL_ARRAY); + gl::DisableClientState(gl::NORMAL_ARRAY); + CV_CheckGlError(); + } + else + { + gl::EnableClientState(gl::NORMAL_ARRAY); + CV_CheckGlError(); + + normal_.bind(GlBuffer::ARRAY_BUFFER); + + gl::NormalPointer(gl_types[normal_.depth()], 0, 0); CV_CheckGlError(); } - if (!color_.empty()) + if (color_.empty()) { - glDisableClientState(GL_COLOR_ARRAY); + gl::DisableClientState(gl::COLOR_ARRAY); + CV_CheckGlError(); + } + else + { + gl::EnableClientState(gl::COLOR_ARRAY); + CV_CheckGlError(); + + color_.bind(GlBuffer::ARRAY_BUFFER); + + const int cn = color_.channels(); + + gl::ColorPointer(cn, gl_types[color_.depth()], 0, 0); CV_CheckGlError(); } - if (!vertex_.empty()) + if (vertex_.empty()) { - glDisableClientState(GL_VERTEX_ARRAY); + gl::DisableClientState(gl::VERTEX_ARRAY); CV_CheckGlError(); } -#endif -} - -//////////////////////////////////////////////////////////////////////// -// GlFont - -cv::GlFont::GlFont(const string& _family, int _height, Weight _weight, Style _style) - : family_(_family), height_(_height), weight_(_weight), style_(_style), base_(0) -{ -#ifndef HAVE_OPENGL - throw_nogl; -#else - base_ = glGenLists(256); - CV_CheckGlError(); - - glFuncTab()->generateBitmapFont(family_, height_, weight_, (style_ & STYLE_ITALIC) != 0, (style_ & STYLE_UNDERLINE) != 0, 0, 256, base_); -#endif -} - -void cv::GlFont::draw(const char* str, size_t len) const -{ -#ifndef HAVE_OPENGL - (void)str; - (void)len; - throw_nogl; -#else - if (base_ && len > 0) + else { - glPushAttrib(GL_LIST_BIT); - glListBase(base_); + gl::EnableClientState(gl::VERTEX_ARRAY); + CV_CheckGlError(); - glCallLists(static_cast(len), GL_UNSIGNED_BYTE, str); - - glPopAttrib(); + vertex_.bind(GlBuffer::ARRAY_BUFFER); + gl::VertexPointer(vertex_.channels(), gl_types[vertex_.depth()], 0, 0); CV_CheckGlError(); } -#endif -} -namespace -{ - class FontCompare : public unary_function, bool> - { - public: - inline FontCompare(const string& family, int height, GlFont::Weight weight, GlFont::Style style) - : family_(family), height_(height), weight_(weight), style_(style) - { - } - - bool operator ()(const cv::Ptr& font) - { - return font->family() == family_ && font->height() == height_ && font->weight() == weight_ && font->style() == style_; - } - - private: - string family_; - int height_; - GlFont::Weight weight_; - GlFont::Style style_; - }; -} - -Ptr cv::GlFont::get(const std::string& family, int height, Weight weight, Style style) -{ -#ifndef HAVE_OPENGL - (void)family; - (void)height; - (void)weight; - (void)style; - throw_nogl; - return Ptr(); -#else - static vector< Ptr > fonts; - fonts.reserve(10); - - vector< Ptr >::iterator fontIt = find_if(fonts.begin(), fonts.end(), FontCompare(family, height, weight, style)); - - if (fontIt == fonts.end()) - { - fonts.push_back(new GlFont(family, height, weight, style)); - - fontIt 
= fonts.end() - 1; - } - - return *fontIt; + GlBuffer::unbind(GlBuffer::ARRAY_BUFFER); #endif } //////////////////////////////////////////////////////////////////////// // Rendering -void cv::render(const GlTexture& tex, Rect_ wndRect, Rect_ texRect) +void cv::render(const GlTexture2D& tex, Rect_ wndRect, Rect_ texRect) { #ifndef HAVE_OPENGL - (void)tex; - (void)wndRect; - (void)texRect; - throw_nogl; + (void) tex; + (void) wndRect; + (void) texRect; + throw_nogl(); #else if (!tex.empty()) { - tex.bind(); - - glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE); - - glBegin(GL_QUADS); - glTexCoord2d(texRect.x, texRect.y); - glVertex2d(wndRect.x, wndRect.y); - - glTexCoord2d(texRect.x, texRect.y + texRect.height); - glVertex2d(wndRect.x, (wndRect.y + wndRect.height)); - - glTexCoord2d(texRect.x + texRect.width, texRect.y + texRect.height); - glVertex2d(wndRect.x + wndRect.width, (wndRect.y + wndRect.height)); - - glTexCoord2d(texRect.x + texRect.width, texRect.y); - glVertex2d(wndRect.x + wndRect.width, wndRect.y); - glEnd(); - + gl::MatrixMode(gl::PROJECTION); + gl::LoadIdentity(); + gl::Ortho(0.0, 1.0, 1.0, 0.0, -1.0, 1.0); CV_CheckGlError(); - tex.unbind(); + gl::MatrixMode(gl::MODELVIEW); + gl::LoadIdentity(); + CV_CheckGlError(); + + gl::Disable(gl::LIGHTING); + CV_CheckGlError(); + + tex.bind(); + + gl::Enable(gl::TEXTURE_2D); + CV_CheckGlError(); + + gl::TexEnvi(gl::TEXTURE_ENV, gl::TEXTURE_ENV_MODE, gl::REPLACE); + CV_CheckGlError(); + + gl::TexParameteri(gl::TEXTURE_2D, gl::TEXTURE_MIN_FILTER, gl::LINEAR); + CV_CheckGlError(); + + const float vertex[] = + { + wndRect.x, wndRect.y, 0.0f, + wndRect.x, (wndRect.y + wndRect.height), 0.0f, + wndRect.x + wndRect.width, (wndRect.y + wndRect.height), 0.0f, + wndRect.x + wndRect.width, wndRect.y, 0.0f + }; + const float texCoords[] = + { + texRect.x, texRect.y, + texRect.x, texRect.y + texRect.height, + texRect.x + texRect.width, texRect.y + texRect.height, + texRect.x + texRect.width, texRect.y + }; + + GlBuffer::unbind(GlBuffer::ARRAY_BUFFER); + + gl::EnableClientState(gl::TEXTURE_COORD_ARRAY); + CV_CheckGlError(); + + gl::TexCoordPointer(2, gl::FLOAT, 0, texCoords); + CV_CheckGlError(); + + gl::DisableClientState(gl::NORMAL_ARRAY); + gl::DisableClientState(gl::COLOR_ARRAY); + CV_CheckGlError(); + + gl::EnableClientState(gl::VERTEX_ARRAY); + CV_CheckGlError(); + + gl::VertexPointer(3, gl::FLOAT, 0, vertex); + CV_CheckGlError(); + + gl::DrawArrays(cv::RenderMode::QUADS, 0, 4); + CV_CheckGlError(); } #endif } @@ -1376,222 +1491,90 @@ void cv::render(const GlTexture& tex, Rect_ wndRect, Rect_ texRe void cv::render(const GlArrays& arr, int mode, Scalar color) { #ifndef HAVE_OPENGL - (void)arr; - (void)mode; - (void)color; - throw_nogl; + (void) arr; + (void) mode; + (void) color; + throw_nogl(); #else - glColor3d(color[0] / 255.0, color[1] / 255.0, color[2] / 255.0); - - arr.bind(); - - glDrawArrays(mode, 0, arr.size().area()); - - arr.unbind(); -#endif -} - -void cv::render(const string& str, const Ptr& font, Scalar color, Point2d pos) -{ -#ifndef HAVE_OPENGL - (void)str; - (void)font; - (void)color; - (void)pos; - throw_nogl; -#else - glPushAttrib(GL_DEPTH_BUFFER_BIT); - - GLint viewport[4]; - glGetIntegerv(GL_VIEWPORT, viewport); - - glDisable(GL_DEPTH_TEST); - - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - - glColor3d(color[0] / 255.0, color[1] / 255.0, color[2] / 255.0); - - glRasterPos2d(2.0 * (viewport[0] + pos.x) / viewport[2] - 1.0, 1.0 - 2.0 * (viewport[1] + pos.y + 
font->height()) / viewport[3]); - - font->draw(str.c_str(), str.length()); - - glPopAttrib(); -#endif -} - -//////////////////////////////////////////////////////////////////////// -// GlCamera - -cv::GlCamera::GlCamera() : - eye_(0.0, 0.0, -5.0), center_(0.0, 0.0, 0.0), up_(0.0, 1.0, 0.0), - pos_(0.0, 0.0, -5.0), yaw_(0.0), pitch_(0.0), roll_(0.0), - useLookAtParams_(false), - - scale_(1.0, 1.0, 1.0), - - projectionMatrix_(), - fov_(45.0), aspect_(0.0), - left_(0.0), right_(1.0), bottom_(1.0), top_(0.0), - zNear_(-1.0), zFar_(1.0), - perspectiveProjection_(false) -{ -} - -void cv::GlCamera::lookAt(Point3d eye, Point3d center, Point3d up) -{ - eye_ = eye; - center_ = center; - up_ = up; - useLookAtParams_ = true; -} - -void cv::GlCamera::setCameraPos(Point3d pos, double yaw, double pitch, double roll) -{ - pos_ = pos; - yaw_ = yaw; - pitch_ = pitch; - roll_ = roll; - useLookAtParams_ = false; -} - -void cv::GlCamera::setScale(Point3d scale) -{ - scale_ = scale; -} - -void cv::GlCamera::setProjectionMatrix(const Mat& projectionMatrix, bool transpose) -{ - CV_Assert(projectionMatrix.type() == CV_32F || projectionMatrix.type() == CV_64F); - CV_Assert(projectionMatrix.cols == 4 && projectionMatrix.rows == 4); - - projectionMatrix_ = transpose ? projectionMatrix.t() : projectionMatrix; -} - -void cv::GlCamera::setPerspectiveProjection(double fov, double aspect, double zNear, double zFar) -{ - fov_ = fov; - aspect_ = aspect; - zNear_ = zNear; - zFar_ = zFar; - - projectionMatrix_.release(); - perspectiveProjection_ = true; -} - -void cv::GlCamera::setOrthoProjection(double left, double right, double bottom, double top, double zNear, double zFar) -{ - left_ = left; - right_ = right; - bottom_ = bottom; - top_ = top; - zNear_ = zNear; - zFar_ = zFar; - - projectionMatrix_.release(); - perspectiveProjection_ = false; -} - -void cv::GlCamera::setupProjectionMatrix() const -{ -#ifndef HAVE_OPENGL - throw_nogl; -#else - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - - if (projectionMatrix_.empty()) + if (!arr.empty()) { - if (perspectiveProjection_) - gluPerspective(fov_, aspect_, zNear_, zFar_); - else - glOrtho(left_, right_, bottom_, top_, zNear_, zFar_); + gl::Color3d(color[0] / 255.0, color[1] / 255.0, color[2] / 255.0); + + arr.bind(); + + gl::DrawArrays(mode, 0, arr.size()); } - else - { - if (projectionMatrix_.type() == CV_32F) - glLoadMatrixf(projectionMatrix_.ptr()); - else - glLoadMatrixd(projectionMatrix_.ptr()); - } - - CV_CheckGlError(); #endif } -void cv::GlCamera::setupModelViewMatrix() const +void cv::render(const GlArrays& arr, InputArray indices, int mode, Scalar color) { #ifndef HAVE_OPENGL - throw_nogl; + (void) arr; + (void) indices; + (void) mode; + (void) color; + throw_nogl(); #else - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - - if (useLookAtParams_) - gluLookAt(eye_.x, eye_.y, eye_.z, center_.x, center_.y, center_.z, up_.x, up_.y, up_.z); - else + if (!arr.empty() && !indices.empty()) { - glRotated(-yaw_, 0.0, 1.0, 0.0); - glRotated(-pitch_, 1.0, 0.0, 0.0); - glRotated(-roll_, 0.0, 0.0, 1.0); - glTranslated(-pos_.x, -pos_.y, -pos_.z); - } + gl::Color3d(color[0] / 255.0, color[1] / 255.0, color[2] / 255.0); - glScaled(scale_.x, scale_.y, scale_.z); + arr.bind(); - CV_CheckGlError(); -#endif -} + const int kind = indices.kind(); -//////////////////////////////////////////////////////////////////////// -// Error handling - -bool icvCheckGlError(const char* file, const int line, const char* func) -{ -#ifndef HAVE_OPENGL - (void)file; - (void)line; - (void)func; - 
return true; -#else - GLenum err = glGetError(); - - if (err != GL_NO_ERROR) - { - const char* msg; - - switch (err) + switch (kind) { - case GL_INVALID_ENUM: - msg = "An unacceptable value is specified for an enumerated argument"; - break; - case GL_INVALID_VALUE: - msg = "A numeric argument is out of range"; - break; - case GL_INVALID_OPERATION: - msg = "The specified operation is not allowed in the current state"; - break; - case GL_STACK_OVERFLOW: - msg = "This command would cause a stack overflow"; - break; - case GL_STACK_UNDERFLOW: - msg = "This command would cause a stack underflow"; - break; - case GL_OUT_OF_MEMORY: - msg = "There is not enough memory left to execute the command"; - break; + case _InputArray::OPENGL_BUFFER : + { + GlBuffer buf = indices.getGlBuffer(); + + const int depth = buf.depth(); + + CV_Assert( buf.channels() == 1 ); + CV_Assert( depth <= CV_32S ); + + GLenum type; + if (depth < CV_16U) + type = gl::UNSIGNED_BYTE; + else if (depth < CV_32S) + type = gl::UNSIGNED_SHORT; + else + type = gl::UNSIGNED_INT; + + buf.bind(GlBuffer::ELEMENT_ARRAY_BUFFER); + + gl::DrawElements(mode, buf.size().area(), type, 0); + + GlBuffer::unbind(GlBuffer::ELEMENT_ARRAY_BUFFER); + + break; + } + default: - msg = "Unknown error"; - }; + { + Mat mat = indices.getMat(); - cvError(CV_OpenGlApiCallError, func, msg, file, line); + const int depth = mat.depth(); - return false; + CV_Assert( mat.channels() == 1 ); + CV_Assert( depth <= CV_32S ); + CV_Assert( mat.isContinuous() ); + + GLenum type; + if (depth < CV_16U) + type = gl::UNSIGNED_BYTE; + else if (depth < CV_32S) + type = gl::UNSIGNED_SHORT; + else + type = gl::UNSIGNED_INT; + + GlBuffer::unbind(GlBuffer::ELEMENT_ARRAY_BUFFER); + + gl::DrawElements(mode, mat.size().area(), type, mat.data); + } + } } - - return true; #endif } diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 78aafcf928..2f62826dd5 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -22,17 +22,14 @@ source_group("Device" FILES ${lib_device_hdrs}) source_group("Device\\Detail" FILES ${lib_device_hdrs_detail}) if (HAVE_CUDA) - file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp") + file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*") file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu") - file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h") - set(ncv_files ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda}) + set(ncv_files ${ncv_srcs} ${ncv_cuda}) source_group("Src\\NVidia" FILES ${ncv_files}) ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS}) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations /wd4211 /wd4201 /wd4100 /wd4505 /wd4408) string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - - #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep") #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;") if(MSVC) @@ -47,23 +44,18 @@ if (HAVE_CUDA) ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda}) - #CUDA_BUILD_CLEAN_TARGET() - set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) - if(NOT APPLE) - unset(CUDA_nvcuvid_LIBRARY CACHE) - find_cuda_helper_libs(nvcuvid) + if(WITH_NVCUVID) set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY}) endif() if(WIN32) - unset(CUDA_nvcuvenc_LIBRARY CACHE) find_cuda_helper_libs(nvcuvenc) set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY}) endif() - if(NOT APPLE AND WITH_FFMPEG) + if(WITH_FFMPEG) set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES}) endif() else() diff --git 
a/modules/gpu/app/nv_perf_test/CMakeLists.txt b/modules/gpu/app/nv_perf_test/CMakeLists.txt new file mode 100644 index 0000000000..c13f5ef46b --- /dev/null +++ b/modules/gpu/app/nv_perf_test/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 2.8.3) + +project(nv_perf_test) + +find_package(OpenCV REQUIRED) +include_directories(${OpenCV_INCLUDE_DIR}) + +add_executable(${PROJECT_NAME} main.cpp) + +target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS}) diff --git a/modules/gpu/app/nv_perf_test/im1_1280x800.jpg b/modules/gpu/app/nv_perf_test/im1_1280x800.jpg new file mode 100644 index 0000000000..bdbbd4aee9 Binary files /dev/null and b/modules/gpu/app/nv_perf_test/im1_1280x800.jpg differ diff --git a/modules/gpu/app/nv_perf_test/im2_1280x800.jpg b/modules/gpu/app/nv_perf_test/im2_1280x800.jpg new file mode 100644 index 0000000000..ae49640a95 Binary files /dev/null and b/modules/gpu/app/nv_perf_test/im2_1280x800.jpg differ diff --git a/modules/gpu/app/nv_perf_test/main.cpp b/modules/gpu/app/nv_perf_test/main.cpp new file mode 100644 index 0000000000..928b30a19e --- /dev/null +++ b/modules/gpu/app/nv_perf_test/main.cpp @@ -0,0 +1,489 @@ +#include +#define HAVE_CUDA 1 +#include +#include +#include +#include +#include +#include +#include + +static void printOsInfo() +{ +#if defined _WIN32 +# if defined _WIN64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout); +# endif +#elif defined linux +# if defined _LP64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout); +# endif +#elif defined __APPLE__ +# if defined _LP64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout); +# endif +#endif +} + +static void printCudaInfo() +{ + const int deviceCount = cv::gpu::getCudaEnabledDeviceCount(); + + printf("[----------]\n"); fflush(stdout); + printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout); + printf("[----------]\n"); fflush(stdout); + + for (int i = 0; i < deviceCount; ++i) + { + cv::gpu::DeviceInfo info(i); + + printf("[----------]\n"); fflush(stdout); + printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout); + printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout); + printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout); + printf("[ ] \tTotal memory: %d Mb\n", static_cast(static_cast(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout); + printf("[ ] \tFree memory: %d Mb\n", static_cast(static_cast(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout); + if (!info.isCompatible()) + printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n"); + printf("[----------]\n"); fflush(stdout); + } +} + +int main(int argc, char* argv[]) +{ + printOsInfo(); + printCudaInfo(); + + perf::Regression::Init("nv_perf_test"); + perf::TestBase::Init(argc, argv); + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + +#define DEF_PARAM_TEST(name, ...) 
typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name +#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name + +////////////////////////////////////////////////////////// +// HoughLinesP + +DEF_PARAM_TEST_1(Image, std::string); + +PERF_TEST_P(Image, HoughLinesP, + testing::Values(std::string("im1_1280x800.jpg"))) +{ + declare.time(30.0); + + std::string fileName = GetParam(); + + const double rho = 1.0; + const double theta = 1.0; + const int threshold = 40; + const int minLineLenght = 20; + const int maxLineGap = 5; + + cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_image(image); + cv::gpu::GpuMat d_lines; + cv::gpu::HoughLinesBuf d_buf; + + cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + } + } + else + { + cv::Mat mask; + cv::Canny(image, mask, 50, 100); + + std::vector lines; + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// GoodFeaturesToTrack + +DEF_PARAM_TEST(Image_Depth, std::string, perf::MatDepth); + +PERF_TEST_P(Image_Depth, GoodFeaturesToTrack, + testing::Combine( + testing::Values(std::string("im1_1280x800.jpg")), + testing::Values(CV_8U, CV_16U) + )) +{ + declare.time(60); + + const std::string fileName = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + + const int maxCorners = 5000; + const double qualityLevel = 0.05; + const int minDistance = 5; + const int blockSize = 3; + const bool useHarrisDetector = true; + const double k = 0.05; + + cv::Mat src = cv::imread(fileName, cv::IMREAD_GRAYSCALE); + if (src.empty()) + FAIL() << "Unable to load source image [" << fileName << "]"; + + if (depth != CV_8U) + src.convertTo(src, depth); + + cv::Mat mask(src.size(), CV_8UC1, cv::Scalar::all(1)); + mask(cv::Rect(0, 0, 100, 100)).setTo(cv::Scalar::all(0)); + + if (PERF_RUN_GPU()) + { + cv::gpu::GoodFeaturesToTrackDetector_GPU d_detector(maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k); + + cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat d_mask(mask); + cv::gpu::GpuMat d_pts; + + d_detector(d_src, d_pts, d_mask); + + TEST_CYCLE() + { + d_detector(d_src, d_pts, d_mask); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat pts; + + cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k); + + TEST_CYCLE() + { + cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowPyrLKSparse + +typedef std::pair string_pair; + +DEF_PARAM_TEST(ImagePair_Depth_GraySource, string_pair, perf::MatDepth, bool); + +PERF_TEST_P(ImagePair_Depth_GraySource, OpticalFlowPyrLKSparse, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(CV_8U, CV_16U), + testing::Bool() + )) +{ + declare.time(60); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + const bool graySource = std::tr1::get<2>(GetParam()); 
+ + // PyrLK params + const cv::Size winSize(15, 15); + const int maxLevel = 5; + const cv::TermCriteria criteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01); + + // GoodFeaturesToTrack params + const int maxCorners = 5000; + const double qualityLevel = 0.05; + const int minDistance = 5; + const int blockSize = 3; + const bool useHarrisDetector = true; + const double k = 0.05; + + cv::Mat src1 = cv::imread(fileNames.first, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + cv::Mat gray_src; + if (graySource) + gray_src = src1; + else + cv::cvtColor(src1, gray_src, cv::COLOR_BGR2GRAY); + + cv::Mat pts; + cv::goodFeaturesToTrack(gray_src, pts, maxCorners, qualityLevel, minDistance, cv::noArray(), blockSize, useHarrisDetector, k); + + if (depth != CV_8U) + { + src1.convertTo(src1, depth); + src2.convertTo(src2, depth); + } + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_pts(pts.reshape(2, 1)); + cv::gpu::GpuMat d_nextPts; + cv::gpu::GpuMat d_status; + + cv::gpu::PyrLKOpticalFlow d_pyrLK; + d_pyrLK.winSize = winSize; + d_pyrLK.maxLevel = maxLevel; + d_pyrLK.iters = criteria.maxCount; + d_pyrLK.useInitialFlow = false; + + d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status); + + TEST_CYCLE() + { + d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat nextPts; + cv::Mat status; + + cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria); + + TEST_CYCLE() + { + cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowFarneback + +DEF_PARAM_TEST(ImagePair_Depth, string_pair, perf::MatDepth); + +PERF_TEST_P(ImagePair_Depth, OpticalFlowFarneback, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(CV_8U, CV_16U) + )) +{ + declare.time(500); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + + const double pyrScale = 0.5; + const int numLevels = 6; + const int winSize = 7; + const int numIters = 15; + const int polyN = 7; + const double polySigma = 1.5; + const int flags = cv::OPTFLOW_USE_INITIAL_FLOW; + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (depth != CV_8U) + { + src1.convertTo(src1, depth); + src2.convertTo(src2, depth); + } + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_u(src1.size(), CV_32FC1, cv::Scalar::all(0)); + cv::gpu::GpuMat d_v(src1.size(), CV_32FC1, cv::Scalar::all(0)); + + cv::gpu::FarnebackOpticalFlow d_farneback; + d_farneback.pyrScale = pyrScale; + d_farneback.numLevels = numLevels; + d_farneback.winSize = winSize; + d_farneback.numIters = numIters; + 
d_farneback.polyN = polyN; + d_farneback.polySigma = polySigma; + d_farneback.flags = flags; + + d_farneback(d_src1, d_src2, d_u, d_v); + + TEST_CYCLE_N(10) + { + d_farneback(d_src1, d_src2, d_u, d_v); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat flow(src1.size(), CV_32FC2, cv::Scalar::all(0)); + + cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags); + + TEST_CYCLE_N(10) + { + cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowBM + +void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr, + cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious, + cv::Mat& velx, cv::Mat& vely) +{ + cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height); + + velx.create(sz, CV_32FC1); + vely.create(sz, CV_32FC1); + + CvMat cvprev = prev; + CvMat cvcurr = curr; + + CvMat cvvelx = velx; + CvMat cvvely = vely; + + cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely); +} + +DEF_PARAM_TEST(ImagePair_BlockSize_ShiftSize_MaxRange, string_pair, cv::Size, cv::Size, cv::Size); + +PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, OpticalFlowBM, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(cv::Size(16, 16)), + testing::Values(cv::Size(2, 2)), + testing::Values(cv::Size(16, 16)) + )) +{ + declare.time(1000); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const cv::Size block_size = std::tr1::get<1>(GetParam()); + const cv::Size shift_size = std::tr1::get<2>(GetParam()); + const cv::Size max_range = std::tr1::get<3>(GetParam()); + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_velx, d_vely, buf; + + cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + + TEST_CYCLE_N(10) + { + cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + } + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE_N(10) + { + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + } + } + + SANITY_CHECK(0); +} + +PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, FastOpticalFlowBM, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(cv::Size(16, 16)), + testing::Values(cv::Size(1, 1)), + testing::Values(cv::Size(16, 16)) + )) +{ + declare.time(1000); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const cv::Size block_size = std::tr1::get<1>(GetParam()); + const cv::Size shift_size = std::tr1::get<2>(GetParam()); + const cv::Size max_range = std::tr1::get<3>(GetParam()); + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << 
"Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_velx, d_vely; + + cv::gpu::FastOpticalFlowBM fastBM; + + fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); + + TEST_CYCLE_N(10) + { + fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); + } + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE_N(10) + { + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + } + } + + SANITY_CHECK(0); +} diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst index 133660236a..a1118b780a 100644 --- a/modules/gpu/doc/object_detection.rst +++ b/modules/gpu/doc/object_detection.rst @@ -199,6 +199,91 @@ Returns block descriptors computed for the whole image. The function is mainly used to learn the classifier. +Soft Cascade Classifier +========================== + +Soft Cascade Classifier for Object Detection +---------------------------------------------------------- + +Cascade detectors have been shown to operate extremely rapidly, with high accuracy, and have important applications in different spheres. The initial goal for this cascade implementation was the fast and accurate pedestrian detector but it also useful in general. Soft cascade is trained with AdaBoost. But instead of training sequence of stages, the soft cascade is trained as a one long stage of T weak classifiers. Soft cascade is formulated as follows: + +.. math:: + \texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)} + +where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the set of thresholded weak classifiers selected during AdaBoost training scaled by the associated weights. Let + +.. math:: + \texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)} + +be the partial sum of sample responses before :math:`t`-the weak classifier will be applied. The funtcion :math:`\texttt{H}_t(x)` of :math:`t` for sample :math:`x` named *sample trace*. +After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` named *rejection trace*. + +The sample has been rejected if it fall rejection threshold. So stageless cascade allows to reject not-object sample as soon as possible. Another meaning of the sample trace is a confidence with that sample recognized as desired object. At each :math:`t` that confidence depend on all previous weak classifier. This feature of soft cascade is resulted in more accurate detection. The original formulation of soft cascade can be found in [BJ05]_. + +.. [BJ05] Lubomir Bourdev and Jonathan Brandt. tRobust Object Detection Via Soft Cascade. IEEE CVPR, 2005. +.. [BMTG12] Rodrigo Benenson, Markus Mathias, Radu Timofte and Luc Van Gool. Pedestrian detection at 100 frames per second. IEEE CVPR, 2012. + + +gpu::SCascade +----------------------------------------------- +.. ocv:class:: gpu::SCascade : public Algorithm + +Implementation of soft (stageless) cascaded detector. 
:: + + class CV_EXPORTS SCascade : public Algorithm + { + struct CV_EXPORTS Detection + { + ushort x; + ushort y; + ushort w; + ushort h; + float confidence; + int kind; + + enum {PEDESTRIAN = 0}; + }; + + SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1); + virtual ~SCascade(); + virtual bool load(const FileNode& fn); + virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; + virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const; + }; + + +gpu::SCascade::~SCascade +--------------------------- +Destructor for SCascade. + +.. ocv:function:: gpu::SCascade::~SCascade() + + + +gpu::SCascade::load +-------------------------- +Load the cascade from a FileNode. + +.. ocv:function:: bool gpu::SCascade::load(const FileNode& fn) + + :param fn: File node from which the soft cascade is read. + + + +gpu::SCascade::detect +-------------------------- +Apply the cascade to an input frame and return the vector of Detection objects. + +.. ocv:function:: void gpu::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const + + :param image: a frame on which the detector will be applied. + + :param rois: a regions-of-interest mask generated by genRoi. Only the objects that fall into one of the regions will be returned. + + :param objects: an output array of detections represented as a GpuMat of SCascade::Detection elements. The first element of the matrix is the count of detections. + + :param stream: a high-level CUDA stream abstraction used for asynchronous execution. + gpu::CascadeClassifier_GPU -------------------------- diff --git a/modules/gpu/src/opencv2/gpu/device/block.hpp b/modules/gpu/include/opencv2/gpu/device/block.hpp similarity index 100% rename from modules/gpu/src/opencv2/gpu/device/block.hpp rename to modules/gpu/include/opencv2/gpu/device/block.hpp diff --git a/modules/gpu/include/opencv2/gpu/device/common.hpp b/modules/gpu/include/opencv2/gpu/device/common.hpp index 141467fdc8..931e4247e9 100644 --- a/modules/gpu/include/opencv2/gpu/device/common.hpp +++ b/modules/gpu/include/opencv2/gpu/device/common.hpp @@ -85,8 +85,6 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int cv::gpu::error(cudaGetErrorString(err), file, line, func); } -#ifdef __CUDACC__ - namespace cv { namespace gpu { __host__ __device__ __forceinline__ int divUp(int total, int grain) @@ -96,19 +94,25 @@ namespace cv { namespace gpu namespace device { + using cv::gpu::divUp; + +#ifdef __CUDACC__ typedef unsigned char uchar; typedef unsigned short ushort; typedef signed char schar; - typedef unsigned int uint; + #ifdef WIN32 + typedef unsigned int uint; + #endif template inline void bindTexture(const textureReference* tex, const PtrStepSz& img) { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) ); } +#endif // __CUDACC__ } }} -#endif // __CUDACC__ + #endif // __OPENCV_GPU_COMMON_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp b/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp index 981e62335c..fb3bfeb9ee 100644 --- a/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp +++ b/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp @@ -807,9 +807,9 @@ namespace cv { namespace gpu { namespace device template static __device__ 
__forceinline__ void RGB2XYZConvert(const T* src, D& dst) { + dst.z = saturate_cast(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift)); dst.x = saturate_cast(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift)); dst.y = saturate_cast(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift)); - dst.z = saturate_cast(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift)); } template static __device__ __forceinline__ uint RGB2XYZConvert(uint src) diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp new file mode 100644 index 0000000000..091a160e31 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp @@ -0,0 +1,361 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(val); + + For::loadToSmem(smem, val, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(val) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, val, tid); + } + + template + static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op) + { + typename GetType::type>::type reg = thrust::get(smem)[tid + delta]; + thrust::get(smem)[tid] = thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::merge(smem, val, tid, delta, op); + } + template + static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op) + { + typename GetType::type>::type reg = shfl_down(thrust::get(val), delta, width); + thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::mergeShfl(val, delta, width, op); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int) + { + } + + template + static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + template + static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + }; + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid) + { + smem[tid] = val; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid) + { + val = smem[tid]; + } + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + template + __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + unsigned int delta, + const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::merge(smem, val, tid, delta, op); + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& val, + unsigned int delta, + unsigned int width, + const thrust::tuple& op) + { + For<0, thrust::tuple_size 
>::value>::mergeShfl(val, delta, width, op); + } + + template struct Generic + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + loadToSmem(smem, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(smem, val, tid, 1024, op); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(smem, val, tid, 512, op); + + __syncthreads(); + } + if (N >= 512) + { + if (tid < 256) + merge(smem, val, tid, 256, op); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(smem, val, tid, 128, op); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(smem, val, tid, 64, op); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(smem, val, tid, 32, op); + } + + if (tid < 16) + { + merge(smem, val, tid, 16, op); + merge(smem, val, tid, 8, op); + merge(smem, val, tid, 4, op); + merge(smem, val, tid, 2, op); + merge(smem, val, tid, 1, op); + } + } + }; + + template + struct Unroll + { + static __device__ void loopShfl(Reference val, Op op, unsigned int N) + { + mergeShfl(val, I, N, op); + Unroll::loopShfl(val, op, N); + } + static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op) + { + merge(smem, val, tid, I, op); + Unroll::loop(smem, val, tid, op); + } + }; + template + struct Unroll<0, Pointer, Reference, Op> + { + static __device__ void loopShfl(Reference, Op, unsigned int) + { + } + static __device__ void loop(Pointer, Reference, unsigned int, Op) + { + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + #if __CUDA_ARCH__ >= 300 + (void) smem; + (void) tid; + + Unroll::loopShfl(val, op, N); + #else + loadToSmem(smem, val, tid); + + if (tid < N / 2) + Unroll::loop(smem, val, tid, op); + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + const unsigned int laneId = Warp::laneId(); + + #if __CUDA_ARCH__ >= 300 + Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #else + loadToSmem(smem, val, tid); + + if (laneId < 16) + Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op); + + __syncthreads(); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #endif + + __syncthreads(); + + loadFromSmem(smem, val, tid); + + if (tid < 32) + { + #if __CUDA_ARCH__ >= 300 + Unroll::loopShfl(val, op, M); + #else + Unroll::loop(smem, val, tid, op); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp new file mode 100644 index 0000000000..a84e0c2fd0 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp @@ -0,0 +1,498 @@ 
+/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_key_val_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(data); + + For::loadToSmem(smem, data, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(data) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, data, tid); + } + + template + static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width) + { + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + + For::copyShfl(val, delta, width); + } + template + static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta) + { + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + + For::copy(svals, val, tid, delta); + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width) + { + typename GetType::type>::type reg = shfl_down(thrust::get(key), delta, width); + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(key) = reg; + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + } + + For::mergeShfl(key, val, cmp, delta, width); + } + template + static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key, + const ValPointerTuple& svals, const ValReferenceTuple& val, + const CmpTuple& cmp, + unsigned int tid, unsigned int delta) + { + typename GetType::type>::type reg = thrust::get(skeys)[tid + delta]; + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(skeys)[tid] = thrust::get(key) = reg; + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + } + + For::merge(skeys, key, svals, val, cmp, tid, delta); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + + template + static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int) + { + } + template + static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int) + { + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int) + { + } + template + static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&, + const ValPointerTuple&, const ValReferenceTuple&, + const CmpTuple&, + unsigned int, unsigned int) + { + } + }; + + ////////////////////////////////////////////////////// + // loadToSmem + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid) + { + smem[tid] = data; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid) + { + data = smem[tid]; + } 
+ template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, data, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); + } + + ////////////////////////////////////////////////////// + // copyVals + + template + __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) + { + val = shfl_down(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta) + { + svals[tid] = val = svals[tid + delta]; + } + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, + unsigned int delta, + int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + + ////////////////////////////////////////////////////// + // merge + + template + __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(K& key, + const thrust::tuple& val, + const Cmp& cmp, + unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& key, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::mergeShfl(key, val, cmp, delta, width); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } + + ////////////////////////////////////////////////////// + // Generic + + template struct Generic + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + loadToSmem(skeys, key, tid); + loadValsToSmem(svals, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(skeys, key, svals, val, cmp, tid, 1024); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(skeys, key, svals, val, cmp, tid, 512); + + __syncthreads(); + } + 
if (N >= 512) + { + if (tid < 256) + merge(skeys, key, svals, val, cmp, tid, 256); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(skeys, key, svals, val, cmp, tid, 128); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(skeys, key, svals, val, cmp, tid, 64); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(skeys, key, svals, val, cmp, tid, 32); + } + + if (tid < 16) + { + merge(skeys, key, svals, val, cmp, tid, 16); + merge(skeys, key, svals, val, cmp, tid, 8); + merge(skeys, key, svals, val, cmp, tid, 4); + merge(skeys, key, svals, val, cmp, tid, 2); + merge(skeys, key, svals, val, cmp, tid, 1); + } + } + }; + + template + struct Unroll + { + static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N) + { + mergeShfl(key, val, cmp, I, N); + Unroll::loopShfl(key, val, cmp, N); + } + static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + merge(skeys, key, svals, val, cmp, tid, I); + Unroll::loop(skeys, key, svals, val, tid, cmp); + } + }; + template + struct Unroll<0, KP, KR, VP, VR, Cmp> + { + static __device__ void loopShfl(KR, VR, Cmp, unsigned int) + { + } + static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp) + { + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + #if 0 // __CUDA_ARCH__ >= 300 + (void) skeys; + (void) svals; + (void) tid; + + Unroll::loopShfl(key, val, cmp, N); + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (tid < N / 2) + Unroll::loop(skeys, key, svals, val, tid, cmp); + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + const unsigned int laneId = Warp::laneId(); + + #if 0 // __CUDA_ARCH__ >= 300 + Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (laneId < 16) + Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp); + + __syncthreads(); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #endif + + __syncthreads(); + + loadFromSmem(skeys, key, tid); + + if (tid < 32) + { + #if 0 // __CUDA_ARCH__ >= 300 + loadFromSmem(svals, val, tid); + + Unroll::loopShfl(key, val, cmp, M); + #else + Unroll::loop(skeys, key, svals, val, tid, cmp); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp deleted file mode 100644 index 0274f204a2..0000000000 --- a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp +++ /dev/null @@ -1,841 +0,0 @@ 
-/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__ -#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__ - -namespace cv { namespace gpu { namespace device -{ - namespace utility_detail - { - /////////////////////////////////////////////////////////////////////////////// - // Reductor - - template struct WarpReductor - { - template static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - if (tid < n) - data[tid] = partial_reduction; - if (n > 32) __syncthreads(); - - if (n > 32) - { - if (tid < n - 32) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - if (tid < 16) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 16) - { - if (tid < n - 16) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 8) - { - if (tid < n - 8) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 4) - { - if (tid < n - 4) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 2) - { - if (tid < n - 2) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - } - }; - template <> struct WarpReductor<64> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - __syncthreads(); - - if (tid < 32) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - template <> struct WarpReductor<32> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 16) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid 
+ 1 ]); - } - } - }; - template <> struct WarpReductor<16> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - template <> struct WarpReductor<8> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - - template struct ReductionDispatcher; - template <> struct ReductionDispatcher - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - WarpReductor::reduce(data, partial_reduction, tid, op); - } - }; - template <> struct ReductionDispatcher - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - if (tid < n) - data[tid] = partial_reduction; - __syncthreads(); - - - if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); } - if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); } - if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); } - - if (tid < 32) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // PredValWarpReductor - - template struct PredValWarpReductor; - template <> struct PredValWarpReductor<64> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 32) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = sdata[tid + 32]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 32]; - } - - reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - template <> struct PredValWarpReductor<32> - { - template - static __device__ void reduce(T& 
myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 16) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - - template <> struct PredValWarpReductor<16> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 8) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - template <> struct PredValWarpReductor<8> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 4) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - - template struct PredValReductionDispatcher; - template <> struct PredValReductionDispatcher - { - template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - PredValWarpReductor::reduce(myData, myVal, sdata, sval, tid, pred); - } - }; - template <> struct PredValReductionDispatcher - { - template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - myData = sdata[tid]; - myVal = sval[tid]; - - if (n >= 512 && tid < 256) - { - T reg = sdata[tid + 256]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 256]; - } - __syncthreads(); - } - if (n >= 256 && tid < 128) - { - T reg = sdata[tid + 128]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 128]; - } - __syncthreads(); - } - if (n >= 128 && tid < 64) - { - T reg = sdata[tid + 64]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 64]; - } - __syncthreads(); - } - - if (tid < 32) - { - if (n >= 64) - { - T reg = sdata[tid + 32]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 32]; - } - } - if (n >= 32) - { - T reg = sdata[tid + 16]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = 
sval[tid + 16]; - } - } - if (n >= 16) - { - T reg = sdata[tid + 8]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - } - if (n >= 8) - { - T reg = sdata[tid + 4]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - } - if (n >= 4) - { - T reg = sdata[tid + 2]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - } - if (n >= 2) - { - T reg = sdata[tid + 1]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // PredVal2WarpReductor - - template struct PredVal2WarpReductor; - template <> struct PredVal2WarpReductor<64> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 32) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = sdata[tid + 32]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 32]; - sval2[tid] = myVal2 = sval2[tid + 32]; - } - - reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - template <> struct PredVal2WarpReductor<32> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 16) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - - template <> struct PredVal2WarpReductor<16> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 8) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = 
reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - template <> struct PredVal2WarpReductor<8> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 4) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - - template struct PredVal2ReductionDispatcher; - template <> struct PredVal2ReductionDispatcher - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - PredVal2WarpReductor::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); - } - }; - template <> struct PredVal2ReductionDispatcher - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - if (n >= 512 && tid < 256) - { - T reg = sdata[tid + 256]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 256]; - sval2[tid] = myVal2 = sval2[tid + 256]; - } - __syncthreads(); - } - if (n >= 256 && tid < 128) - { - T reg = sdata[tid + 128]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 128]; - sval2[tid] = myVal2 = sval2[tid + 128]; - } - __syncthreads(); - } - if (n >= 128 && tid < 64) - { - T reg = sdata[tid + 64]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 64]; - sval2[tid] = myVal2 = sval2[tid + 64]; - } - __syncthreads(); - } - - if (tid < 32) - { - if (n >= 64) - { - T reg = sdata[tid + 32]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 32]; - sval2[tid] = myVal2 = sval2[tid + 32]; - } - } - if (n >= 32) - { - T reg = sdata[tid + 16]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - } - if (n >= 16) - { - T reg = sdata[tid + 8]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - } - if (n >= 8) - { - T reg = sdata[tid + 4]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 
4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - } - if (n >= 4) - { - T reg = sdata[tid + 2]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - } - if (n >= 2) - { - T reg = sdata[tid + 1]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - } - }; - } // namespace utility_detail -}}} // namespace cv { namespace gpu { namespace device - -#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/emulation.hpp b/modules/gpu/include/opencv2/gpu/device/emulation.hpp index 074e911275..b6fba230e7 100644 --- a/modules/gpu/include/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/include/opencv2/gpu/device/emulation.hpp @@ -44,7 +44,6 @@ #define OPENCV_GPU_EMULATION_HPP_ #include "warp_reduce.hpp" -#include namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp index c601cf5273..6064e8e99c 100644 --- a/modules/gpu/include/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp @@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device template <> struct name : binary_function \ { \ __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \ - __device__ __forceinline__ name(const name& other):binary_function(){}\ - __device__ __forceinline__ name():binary_function(){}\ + __device__ __forceinline__ name() {}\ + __device__ __forceinline__ name(const name&) {}\ }; template struct maximum : binary_function { __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { - return lhs < rhs ? rhs : lhs; + return max(lhs, rhs); } - __device__ __forceinline__ maximum(const maximum& other):binary_function(){} - __device__ __forceinline__ maximum():binary_function(){} + __device__ __forceinline__ maximum() {} + __device__ __forceinline__ maximum(const maximum&) {} }; OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max) @@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device { __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { - return lhs < rhs ? 
lhs : rhs; + return min(lhs, rhs); } - __device__ __forceinline__ minimum(const minimum& other):binary_function(){} - __device__ __forceinline__ minimum():binary_function(){} + __device__ __forceinline__ minimum() {} + __device__ __forceinline__ minimum(const minimum&) {} }; OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min) @@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device // Math functions ///bound========================================= + + template struct abs_func : unary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType x) const + { + return abs(x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned char operator ()(unsigned char x) const + { + return x; + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ signed char operator ()(signed char x) const + { + return ::abs((int)x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ char operator ()(char x) const + { + return ::abs((int)x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned short operator ()(unsigned short x) const + { + return x; + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ short operator ()(short x) const + { + return ::abs((int)x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned int operator ()(unsigned int x) const + { + return x; + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ int operator ()(int x) const + { + return ::abs(x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ float operator ()(float x) const + { + return ::fabsf(x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ double operator ()(double x) const + { + return ::fabs(x); + } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} + }; + #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \ template struct name ## _func : unary_function \ { \ @@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device { \ return func ## f(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + __device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; \ template <> struct name ## _func : unary_function \ { \ @@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device { \ return func(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + 
__device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \ @@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device } \ }; - OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2) diff --git a/modules/gpu/include/opencv2/gpu/device/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/reduce.hpp new file mode 100644 index 0000000000..2161b06495 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/reduce.hpp @@ -0,0 +1,197 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_HPP__ +#define __OPENCV_GPU_REDUCE_HPP__ + +#include +#include "detail/reduce.hpp" +#include "detail/reduce_key_val.hpp" + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op) + { + reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); + } + template + __device__ __forceinline__ void reduce(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& op) + { + reduce_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&>(smem, val, tid, op); + } + + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, + const thrust::tuple&, + const Cmp&>(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple& + >(skeys, key, svals, val, tid, cmp); + } + + // smem_tuple + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0) + { + return thrust::make_tuple((volatile T0*) t0); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile 
T5*) t5, (volatile T6*) t6, (volatile T7*) t7); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9); + } +}}} + +#endif // __OPENCV_GPU_UTILITY_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp index 7bb1da751f..7a2799fa37 100644 --- a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp @@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ uchar saturate_cast(schar v) { - return (uchar) ::max((int)v, 0); - } - template<> __device__ __forceinline__ uchar saturate_cast(ushort v) - { - return (uchar) ::min((uint)v, (uint)UCHAR_MAX); - } - template<> __device__ __forceinline__ uchar saturate_cast(int v) - { - return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); - } - template<> __device__ __forceinline__ uchar saturate_cast(uint v) - { - return (uchar) ::min(v, (uint)UCHAR_MAX); + uint res = 0; + int vi = v; + asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast(short v) { - return saturate_cast((uint)v); + uint res = 0; + asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ uchar saturate_cast(ushort v) + { + uint res = 0; + asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ uchar saturate_cast(int v) + { + uint res = 0; + asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } + template<> __device__ __forceinline__ uchar saturate_cast(uint v) + { + uint res = 0; + asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } - template<> __device__ __forceinline__ uchar saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + uint res = 0; + asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + uint res = 0; + asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ schar saturate_cast(uchar v) { - return (schar) ::min((int)v, SCHAR_MAX); - } - template<> __device__ __forceinline__ schar saturate_cast(ushort v) - { - return (schar) ::min((uint)v, (uint)SCHAR_MAX); - } - template<> __device__ __forceinline__ schar saturate_cast(int v) - { - return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? 
SCHAR_MAX : SCHAR_MIN); + uint res = 0; + uint vi = v; + asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; } template<> __device__ __forceinline__ schar saturate_cast(short v) { - return saturate_cast((int)v); + uint res = 0; + asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ schar saturate_cast(ushort v) + { + uint res = 0; + asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ schar saturate_cast(int v) + { + uint res = 0; + asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ schar saturate_cast(uint v) { - return (schar) ::min(v, (uint)SCHAR_MAX); + uint res = 0; + asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + uint res = 0; + asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ schar saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + uint res = 0; + asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ ushort saturate_cast(schar v) { - return (ushort) ::max((int)v, 0); + ushort res = 0; + int vi = v; + asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(short v) { - return (ushort) ::max((int)v, 0); + ushort res = 0; + asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(int v) { - return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); + ushort res = 0; + asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(uint v) { - return (ushort) ::min(v, (uint)USHRT_MAX); + ushort res = 0; + asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + ushort res = 0; + asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + ushort res = 0; + asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ short saturate_cast(ushort v) { - return (short) ::min((int)v, SHRT_MAX); + short res = 0; + asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(int v) { - return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? 
SHRT_MAX : SHRT_MIN); + short res = 0; + asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(uint v) { - return (short) ::min(v, (uint)SHRT_MAX); + short res = 0; + asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + short res = 0; + asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + short res = 0; + asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif } + template<> __device__ __forceinline__ int saturate_cast(uint v) + { + int res = 0; + asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } template<> __device__ __forceinline__ int saturate_cast(float v) { return __float2int_rn(v); @@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device #endif } + template<> __device__ __forceinline__ uint saturate_cast(schar v) + { + uint res = 0; + int vi = v; + asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; + } + template<> __device__ __forceinline__ uint saturate_cast(short v) + { + uint res = 0; + asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ uint saturate_cast(int v) + { + uint res = 0; + asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } template<> __device__ __forceinline__ uint saturate_cast(float v) { return __float2uint_rn(v); diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp index 88a73a10ea..83eaaa21ce 100644 --- a/modules/gpu/include/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp @@ -45,7 +45,6 @@ #include "saturate_cast.hpp" #include "datamov_utils.hpp" -#include "detail/reduction_detail.hpp" namespace cv { namespace gpu { namespace device { @@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device } }; - /////////////////////////////////////////////////////////////////////////////// - // Reduction - - template __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::ReductionDispatcher::reduce(data, partial_reduction, tid, op); - } - - template - __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::PredValReductionDispatcher::reduce(myData, myVal, sdata, sval, tid, pred); - } - - template - __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::PredVal2ReductionDispatcher::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); - } - /////////////////////////////////////////////////////////////////////////////// // Solve linear system diff --git a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp index b7861bca75..d5b4bb202c 100644 --- 
a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp @@ -43,7 +43,7 @@ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__ -#include "utility.hpp" +#include "reduce.hpp" #include "functional.hpp" #include "detail/vec_distance_detail.hpp" @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const @@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const diff --git a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp index 0ec790c0b7..1c46dc0c33 100644 --- a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp @@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \ - OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \ + OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \ @@ -327,4 +327,4 @@ namespace cv { namespace gpu { namespace device #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP }}} // namespace cv { namespace gpu { namespace device -#endif // __OPENCV_GPU_VECMATH_HPP__ \ No newline at end of file +#endif // __OPENCV_GPU_VECMATH_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp new file mode 100644 index 0000000000..8b4479a79b --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp @@ -0,0 +1,145 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
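// Editorial sketch (not from the patch): the new device/reduce.hpp interface,
// as exercised by the vec_distance changes above. Assumptions: a 256-thread
// block, reduce<N>'s template argument is the number of participating threads,
// and plus<> is the functor from functional.hpp.
__global__ void blockSum(const int* src, int* dst, int n)
{
    __shared__ int smem[256];
    int val = 0;
    for (int i = threadIdx.x; i < n; i += 256)
        val += src[i];

    cv::gpu::device::reduce<256>(smem, val, threadIdx.x, cv::gpu::device::plus<int>());

    if (threadIdx.x == 0)
        *dst = val;                        // thread 0 holds the block-wide sum
}
// For several values at once, smem_tuple() above packs the per-value shared
// buffers so a single reduce<N>() call can combine them, e.g.
//   reduce<256>(smem_tuple(ssum, smax), thrust::tie(mySum, myMax), tid,
//               thrust::make_tuple(plus<int>(), maximum<int>()));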
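// Editorial note on the saturate_cast changes above (not part of the patch):
// each specialization now maps to a single PTX "cvt" with the .sat modifier,
// and the float/double paths use .rni (round to nearest even), matching the
// old __float2int_rn() behaviour. A host-side reference of cvt.rni.sat.u8.f32:
static inline unsigned char saturate_u8_ref(float v)
{
    int i = (int)lrintf(v);                                  // <math.h>: round to nearest, ties to even
    return (unsigned char)(i < 0 ? 0 : (i > 255 ? 255 : i)); // the .sat clamp to [0, 255]
}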
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__ +#define __OPENCV_GPU_WARP_SHUFFLE_HPP__ + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl(val, srcLane, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl((int) val, srcLane, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl(lo, srcLane, width); + hi = __shfl(hi, srcLane, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_down(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl_down((int) val, delta, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_down(lo, delta, width); + hi = __shfl_down(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_up(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl_up((int) 
val, delta, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_up(lo, delta, width); + hi = __shfl_up(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } +}}} + +#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index ddb131788f..c6ce2faff3 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -792,31 +792,23 @@ private: GpuMat lab, l, ab; }; +struct CV_EXPORTS CannyBuf +{ + void create(const Size& image_size, int apperture_size = 3); + void release(); -struct CV_EXPORTS CannyBuf; + GpuMat dx, dy; + GpuMat mag; + GpuMat map; + GpuMat st1, st2; + Ptr filterDX, filterDY; +}; CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); -struct CV_EXPORTS CannyBuf -{ - CannyBuf() {} - explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);} - CannyBuf(const GpuMat& dx_, const GpuMat& dy_); - - void create(const Size& image_size, int apperture_size = 3); - - void release(); - - GpuMat dx, dy; - GpuMat dx_buf, dy_buf; - GpuMat edgeBuf; - GpuMat trackBuf1, trackBuf2; - Ptr filterDX, filterDY; -}; - class CV_EXPORTS ImagePyramid { public: @@ -855,6 +847,11 @@ CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float th CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray()); +//! HoughLinesP + +//! finds line segments in the black-n-white image using probabalistic Hough transform +CV_EXPORTS void HoughLinesP(const GpuMat& image, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines = 4096); + //! HoughCircles struct HoughCirclesBuf @@ -1036,11 +1033,9 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels //! Calculates histogram for 8u one channel image //! Output hist will have one row, 256 cols and CV32SC1 type. CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); -CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //! 
normalizes the grayscale image brightness and contrast by normalizing its histogram CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); -CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null()); CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //////////////////////////////// StereoBM_GPU //////////////////////////////// @@ -1532,6 +1527,97 @@ public: int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4); }; +// ======================== GPU version for soft cascade ===================== // + +class CV_EXPORTS ChannelsProcessor +{ +public: + enum + { + GENERIC = 1 << 4, + SEPARABLE = 2 << 4 + }; + + // Appends specified number of HOG first-order features integrals into given vector. + // Param frame is an input 3-channel bgr image. + // Param channels is a GPU matrix of optionally shrinked channels + // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution. + virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0; + + // Creates a specific preprocessor implementation. + // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum + // Param bins is a number of HOG-like channels. + // Param flags is a channel computing extra flags. + static cv::Ptr create(const int shrinkage, const int bins, const int flags = GENERIC); + + virtual ~ChannelsProcessor(); + +protected: + ChannelsProcessor(); +}; + +// Implementation of soft (stageless) cascaded detector. +class CV_EXPORTS SCascade : public Algorithm +{ +public: + + // Representation of detectors result. + struct CV_EXPORTS Detection + { + ushort x; + ushort y; + ushort w; + ushort h; + float confidence; + int kind; + + enum {PEDESTRIAN = 0}; + }; + + enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF}; + + // An empty cascade will be created. + // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed. + // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed. + // Param scales is a number of scales from minScale to maxScale. + // Param flags is an extra tuning flags. + SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, + const int flags = NO_REJECT || ChannelsProcessor::GENERIC); + + virtual ~SCascade(); + + cv::AlgorithmInfo* info() const; + + // Load cascade from FileNode. + // Param fn is a root node for cascade. Should be . + virtual bool load(const FileNode& fn); + + // Load cascade config. + virtual void read(const FileNode& fn); + + // Return the matrix of of detectioned objects. + // Param image is a frame on which detector will be applied. + // Param rois is a regions of interests mask generated by genRoi. + // Only the objects that fall into one of the regions will be returned. + // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection) + // The first element of the matrix is actually a count of detections. 
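// Editorial usage sketch for the SCascade interface documented above (not part
// of the patch; the file name, ROI preparation and buffer size are illustrative):
//   cv::gpu::SCascade cascade;
//   cv::FileStorage fs("soft_cascade.xml", cv::FileStorage::READ);
//   if (!cascade.load(fs.getFirstTopLevelNode())) return;
//
//   cv::gpu::GpuMat frame(bgr);                                        // 3-channel BGR input
//   cv::gpu::GpuMat rois(frame.size(), CV_8UC1, cv::Scalar::all(1));   // "search everywhere" mask
//   cv::gpu::GpuMat objects(1, 4096 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1);
//
//   cascade.detect(frame, rois, objects);
// The first element of `objects` is the detection count; SCascade::Detection
// records follow it.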
+ // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution + virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; + +private: + + struct Fields; + Fields* fields; + + double minScale; + double maxScale; + int scales; + + int flags; +}; + +CV_EXPORTS bool initModule_gpu(void); + ////////////////////////////////// SURF ////////////////////////////////////////// class CV_EXPORTS SURF_GPU @@ -1877,8 +1963,6 @@ private: GpuMat uPyr_[2]; GpuMat vPyr_[2]; - - bool isDeviceArch11_; }; @@ -1895,7 +1979,6 @@ public: polyN = 5; polySigma = 1.1; flags = 0; - isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12); } int numLevels; @@ -1943,8 +2026,113 @@ private: GpuMat frames_[2]; GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2]; std::vector pyramid0_, pyramid1_; +}; - bool isDeviceArch11_; + +// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method +// +// see reference: +// [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". +// [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". +class CV_EXPORTS OpticalFlowDual_TVL1_GPU +{ +public: + OpticalFlowDual_TVL1_GPU(); + + void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy); + + void collectGarbage(); + + /** + * Time step of the numerical scheme. + */ + double tau; + + /** + * Weight parameter for the data term, attachment parameter. + * This is the most relevant parameter, which determines the smoothness of the output. + * The smaller this parameter is, the smoother the solutions we obtain. + * It depends on the range of motions of the images, so its value should be adapted to each image sequence. + */ + double lambda; + + /** + * Weight parameter for (u - v)^2, tightness parameter. + * It serves as a link between the attachment and the regularization terms. + * In theory, it should have a small value in order to maintain both parts in correspondence. + * The method is stable for a large range of values of this parameter. + */ + double theta; + + /** + * Number of scales used to create the pyramid of images. + */ + int nscales; + + /** + * Number of warpings per scale. + * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale. + * This is a parameter that assures the stability of the method. + * It also affects the running time, so it is a compromise between speed and accuracy. + */ + int warps; + + /** + * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time. + * A small value will yield more accurate solutions at the expense of a slower convergence. + */ + double epsilon; + + /** + * Stopping criterion iterations number used in the numerical scheme. + */ + int iterations; + + bool useInitialFlow; + +private: + void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2); + + std::vector I0s; + std::vector I1s; + std::vector u1s; + std::vector u2s; + + GpuMat I1x_buf; + GpuMat I1y_buf; + + GpuMat I1w_buf; + GpuMat I1wx_buf; + GpuMat I1wy_buf; + + GpuMat grad_buf; + GpuMat rho_c_buf; + + GpuMat p11_buf; + GpuMat p12_buf; + GpuMat p21_buf; + GpuMat p22_buf; + + GpuMat diff_buf; + GpuMat norm_buf; +}; + + +//! 
Calculates optical flow for 2 images using block matching algorithm */ +CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, + Size block_size, Size shift_size, Size max_range, bool use_previous, + GpuMat& velx, GpuMat& vely, GpuMat& buf, + Stream& stream = Stream::Null()); + +class CV_EXPORTS FastOpticalFlowBM +{ +public: + void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null()); + +private: + GpuMat buffer; + GpuMat extended_I0; + GpuMat extended_I1; }; diff --git a/modules/gpu/misc/carma.toolchain.cmake b/modules/gpu/misc/carma.toolchain.cmake new file mode 100644 index 0000000000..18f0e0f934 --- /dev/null +++ b/modules/gpu/misc/carma.toolchain.cmake @@ -0,0 +1,26 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_VERSION 1) +set(CMAKE_SYSTEM_PROCESSOR arm) + +set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc-4.5) +set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++-4.5) + +#suppress compiller varning +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-psabi" ) + +# can be any other plases +set(__arm_linux_eabi_root /usr/arm-linux-gnueabi) + +set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${__arm_linux_eabi_root}) + +if(EXISTS ${CUDA_TOOLKIT_ROOT_DIR}) + set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${CUDA_TOOLKIT_ROOT_DIR}) +endif() + +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) + +set(CARMA 1) +add_definitions(-DCARMA) diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp index 725bb9b3d3..ad722fa3b5 100644 --- a/modules/gpu/perf/perf_core.cpp +++ b/modules/gpu/perf/perf_core.cpp @@ -28,27 +28,17 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Merge, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_D cv::gpu::GpuMat d_dst; - cv::gpu::merge(d_src, d_dst); + TEST_CYCLE() cv::gpu::merge(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::merge(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-12); } else { cv::Mat dst; - cv::merge(src, dst); + TEST_CYCLE() cv::merge(src, dst); - TEST_CYCLE() - { - cv::merge(src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-12); } } @@ -69,28 +59,18 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Split, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_D std::vector d_dst; - cv::gpu::split(d_src, d_dst); - - TEST_CYCLE() - { - cv::gpu::split(d_src, d_dst); - } + TEST_CYCLE() cv::gpu::split(d_src, d_dst); cv::gpu::GpuMat first = d_dst[0]; - GPU_SANITY_CHECK(first); + GPU_SANITY_CHECK(first, 1e-12); } else { std::vector dst; - cv::split(src, dst); + TEST_CYCLE() cv::split(src, dst); - TEST_CYCLE() - { - cv::split(src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-12); } } @@ -114,27 +94,17 @@ PERF_TEST_P(Sz_Depth, Core_AddMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_DEP cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::add(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::add(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::add(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::add(src1, src2, dst); + TEST_CYCLE() cv::add(src1, src2, dst); - TEST_CYCLE() - { - cv::add(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -156,27 +126,17 @@ PERF_TEST_P(Sz_Depth, Core_AddScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_ cv::gpu::GpuMat d_src(src); 
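// Editorial usage sketch for the OpticalFlowDual_TVL1_GPU class declared in the
// gpu.hpp hunk above (not part of the patch); d_frame0/d_frame1 are assumed to
// be single-channel GpuMats already uploaded to the device:
//   cv::gpu::OpticalFlowDual_TVL1_GPU tvl1;
//   tvl1.lambda = 0.15;                          // data-term weight, see the parameter notes above
//   cv::gpu::GpuMat d_flowx, d_flowy;
//   tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
//   tvl1.collectGarbage();                       // release the internal pyramid buffers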
cv::gpu::GpuMat d_dst; - cv::gpu::add(d_src, s, d_dst); + TEST_CYCLE() cv::gpu::add(d_src, s, d_dst); - TEST_CYCLE() - { - cv::gpu::add(d_src, s, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::add(src, s, dst); + TEST_CYCLE() cv::add(src, s, dst); - TEST_CYCLE() - { - cv::add(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -200,27 +160,17 @@ PERF_TEST_P(Sz_Depth, Core_SubtractMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MA cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::subtract(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::subtract(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::subtract(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::subtract(src1, src2, dst); + TEST_CYCLE() cv::subtract(src1, src2, dst); - TEST_CYCLE() - { - cv::subtract(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -242,27 +192,17 @@ PERF_TEST_P(Sz_Depth, Core_SubtractScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::subtract(d_src, s, d_dst); + TEST_CYCLE() cv::gpu::subtract(d_src, s, d_dst); - TEST_CYCLE() - { - cv::gpu::subtract(d_src, s, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::subtract(src, s, dst); + TEST_CYCLE() cv::subtract(src, s, dst); - TEST_CYCLE() - { - cv::subtract(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -286,27 +226,17 @@ PERF_TEST_P(Sz_Depth, Core_MultiplyMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MA cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::multiply(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::multiply(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::multiply(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::multiply(src1, src2, dst); + TEST_CYCLE() cv::multiply(src1, src2, dst); - TEST_CYCLE() - { - cv::multiply(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -330,25 +260,17 @@ PERF_TEST_P(Sz_Depth, Core_MultiplyScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM cv::gpu::multiply(d_src, s, d_dst); - TEST_CYCLE() - { - cv::gpu::multiply(d_src, s, d_dst); - } + TEST_CYCLE() cv::gpu::multiply(d_src, s, d_dst); - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::multiply(src, s, dst); + TEST_CYCLE() cv::multiply(src, s, dst); - TEST_CYCLE() - { - cv::multiply(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -372,27 +294,17 @@ PERF_TEST_P(Sz_Depth, Core_DivideMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_ cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::divide(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::divide(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::divide(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::divide(src1, src2, dst); + TEST_CYCLE() cv::divide(src1, src2, dst); - TEST_CYCLE() - { - cv::divide(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -414,27 +326,17 @@ PERF_TEST_P(Sz_Depth, Core_DivideScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_M cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::divide(d_src, s, d_dst); + TEST_CYCLE() cv::gpu::divide(d_src, s, d_dst); - 
TEST_CYCLE() - { - cv::gpu::divide(d_src, s, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::divide(src, s, dst); + TEST_CYCLE() cv::divide(src, s, dst); - TEST_CYCLE() - { - cv::divide(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -456,27 +358,17 @@ PERF_TEST_P(Sz_Depth, Core_DivideScalarInv, Combine(GPU_TYPICAL_MAT_SIZES, ARITH cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::divide(s, d_src, d_dst); + TEST_CYCLE() cv::gpu::divide(s, d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::divide(s, d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::divide(s, src, dst); + TEST_CYCLE() cv::divide(s, src, dst); - TEST_CYCLE() - { - cv::divide(s, src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -500,27 +392,17 @@ PERF_TEST_P(Sz_Depth, Core_AbsDiffMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::absdiff(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::absdiff(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::absdiff(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::absdiff(src1, src2, dst); + TEST_CYCLE() cv::absdiff(src1, src2, dst); - TEST_CYCLE() - { - cv::absdiff(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -542,27 +424,17 @@ PERF_TEST_P(Sz_Depth, Core_AbsDiffScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::absdiff(d_src, s, d_dst); + TEST_CYCLE() cv::gpu::absdiff(d_src, s, d_dst); - TEST_CYCLE() - { - cv::gpu::absdiff(d_src, s, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::absdiff(src, s, dst); + TEST_CYCLE() cv::absdiff(src, s, dst); - TEST_CYCLE() - { - cv::absdiff(src, s, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -582,19 +454,11 @@ PERF_TEST_P(Sz_Depth, Core_Abs, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_16S, CV cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::abs(d_src, d_dst); + TEST_CYCLE() cv::gpu::abs(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::abs(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); - } - else - { - FAIL() << "No such CPU implementation analogy"; + GPU_SANITY_CHECK(d_dst, 1e-8); } + else FAIL_NO_CPU(); } ////////////////////////////////////////////////////////////////////// @@ -613,19 +477,11 @@ PERF_TEST_P(Sz_Depth, Core_Sqr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::sqr(d_src, d_dst); + TEST_CYCLE() cv::gpu::sqr(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::sqr(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); - } - else - { - FAIL() << "No such CPU implementation analogy"; + GPU_SANITY_CHECK(d_dst, 1e-8); } + else FAIL_NO_CPU(); } ////////////////////////////////////////////////////////////////////// @@ -644,27 +500,17 @@ PERF_TEST_P(Sz_Depth, Core_Sqrt, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::sqrt(d_src, d_dst); + TEST_CYCLE() cv::gpu::sqrt(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::sqrt(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::sqrt(src, dst); + TEST_CYCLE() cv::sqrt(src, dst); - TEST_CYCLE() - { - cv::sqrt(src, dst); - } - - CPU_SANITY_CHECK(dst); + 
CPU_SANITY_CHECK(dst, 1e-8); } } @@ -684,27 +530,17 @@ PERF_TEST_P(Sz_Depth, Core_Log, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::log(d_src, d_dst); + TEST_CYCLE() cv::gpu::log(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::log(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::log(src, dst); + TEST_CYCLE() cv::log(src, dst); - TEST_CYCLE() - { - cv::log(src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -724,27 +560,17 @@ PERF_TEST_P(Sz_Depth, Core_Exp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::exp(d_src, d_dst); + TEST_CYCLE() cv::gpu::exp(d_src, d_dst); - TEST_CYCLE() - { - cv::gpu::exp(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::exp(src, dst); + TEST_CYCLE() TEST_CYCLE() cv::exp(src, dst); - TEST_CYCLE() - { - cv::exp(src, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -767,27 +593,17 @@ PERF_TEST_P(Sz_Depth_Power, Core_Pow, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::pow(d_src, power, d_dst); + TEST_CYCLE() cv::gpu::pow(d_src, power, d_dst); - TEST_CYCLE() - { - cv::gpu::pow(d_src, power, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::pow(src, power, dst); + TEST_CYCLE() cv::pow(src, power,dst); - TEST_CYCLE() - { - cv::pow(src, power, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -817,12 +633,7 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITH cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::compare(d_src1, d_src2, d_dst, cmp_code); - - TEST_CYCLE() - { - cv::gpu::compare(d_src1, d_src2, d_dst, cmp_code); - } + TEST_CYCLE() cv::gpu::compare(d_src1, d_src2, d_dst, cmp_code); GPU_SANITY_CHECK(d_dst); } @@ -830,12 +641,7 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITH { cv::Mat dst; - cv::compare(src1, src2, dst, cmp_code); - - TEST_CYCLE() - { - cv::compare(src1, src2, dst, cmp_code); - } + TEST_CYCLE() cv::compare(src1, src2, dst, cmp_code); CPU_SANITY_CHECK(dst); } @@ -860,12 +666,7 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareScalar, Combine(GPU_TYPICAL_MAT_SIZES, AR cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::compare(d_src, s, d_dst, cmp_code); - - TEST_CYCLE() - { - cv::gpu::compare(d_src, s, d_dst, cmp_code); - } + TEST_CYCLE() cv::gpu::compare(d_src, s, d_dst, cmp_code); GPU_SANITY_CHECK(d_dst); } @@ -873,12 +674,7 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareScalar, Combine(GPU_TYPICAL_MAT_SIZES, AR { cv::Mat dst; - cv::compare(src, s, dst, cmp_code); - - TEST_CYCLE() - { - cv::compare(src, s, dst, cmp_code); - } + TEST_CYCLE() cv::compare(src, s, dst, cmp_code); CPU_SANITY_CHECK(dst); } @@ -900,12 +696,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseNot, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_ cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_not(d_src, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_not(d_src, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_not(d_src,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -913,12 +704,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseNot, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_ { cv::Mat dst; - cv::bitwise_not(src, dst); - - TEST_CYCLE() - { - cv::bitwise_not(src, dst); - } + TEST_CYCLE() 
cv::bitwise_not(src,dst); CPU_SANITY_CHECK(dst); } @@ -944,12 +730,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseAndMat, Combine(GPU_TYPICAL_MAT_SIZES, Values( cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_and(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_and(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_and(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -957,12 +738,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseAndMat, Combine(GPU_TYPICAL_MAT_SIZES, Values( { cv::Mat dst; - cv::bitwise_and(src1, src2, dst); - - TEST_CYCLE() - { - cv::bitwise_and(src1, src2, dst); - } + TEST_CYCLE() cv::bitwise_and(src1, src2,dst); } } @@ -987,12 +763,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(GPU_TYPICAL_MAT_SIZES, V cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_and(d_src, s, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_and(d_src, s, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_and(d_src, s,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1000,12 +771,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(GPU_TYPICAL_MAT_SIZES, V { cv::Mat dst; - cv::bitwise_and(src, s, dst); - - TEST_CYCLE() - { - cv::bitwise_and(src, s, dst); - } + TEST_CYCLE() cv::bitwise_and(src, s,dst); CPU_SANITY_CHECK(dst); } @@ -1031,12 +797,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseOrMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(C cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_or(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_or(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_or(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1044,12 +805,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseOrMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(C { cv::Mat dst; - cv::bitwise_or(src1, src2, dst); - - TEST_CYCLE() - { - cv::bitwise_or(src1, src2, dst); - } + TEST_CYCLE() cv::bitwise_or(src1, src2,dst); CPU_SANITY_CHECK(dst); } @@ -1076,12 +832,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(GPU_TYPICAL_MAT_SIZES, Va cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_or(d_src, s, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_or(d_src, s, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_or(d_src, s,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1089,12 +840,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(GPU_TYPICAL_MAT_SIZES, Va { cv::Mat dst; - cv::bitwise_or(src, s, dst); - - TEST_CYCLE() - { - cv::bitwise_or(src, s, dst); - } + TEST_CYCLE() cv::bitwise_or(src, s,dst); CPU_SANITY_CHECK(dst); } @@ -1120,12 +866,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseXorMat, Combine(GPU_TYPICAL_MAT_SIZES, Values( cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_xor(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_xor(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_xor(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1133,12 +874,7 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseXorMat, Combine(GPU_TYPICAL_MAT_SIZES, Values( { cv::Mat dst; - cv::bitwise_xor(src1, src2, dst); - - TEST_CYCLE() - { - cv::bitwise_xor(src1, src2, dst); - } + TEST_CYCLE() cv::bitwise_xor(src1, src2,dst); } } @@ -1163,12 +899,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(GPU_TYPICAL_MAT_SIZES, V cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::bitwise_xor(d_src, s, d_dst); - - TEST_CYCLE() - { - cv::gpu::bitwise_xor(d_src, s, d_dst); - } + TEST_CYCLE() cv::gpu::bitwise_xor(d_src, s,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1176,12 +907,7 @@ 
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(GPU_TYPICAL_MAT_SIZES, V { cv::Mat dst; - cv::bitwise_xor(src, s, dst); - - TEST_CYCLE() - { - cv::bitwise_xor(src, s, dst); - } + TEST_CYCLE() cv::bitwise_xor(src, s,dst); CPU_SANITY_CHECK(dst); } @@ -1208,18 +934,13 @@ PERF_TEST_P(Sz_Depth_Cn, Core_RShift, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::rshift(d_src, val, d_dst); - - TEST_CYCLE() - { - cv::gpu::rshift(d_src, val, d_dst); - } + TEST_CYCLE() cv::gpu::rshift(d_src, val,d_dst); GPU_SANITY_CHECK(d_dst); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -1244,18 +965,13 @@ PERF_TEST_P(Sz_Depth_Cn, Core_LShift, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::lshift(d_src, val, d_dst); - - TEST_CYCLE() - { - cv::gpu::lshift(d_src, val, d_dst); - } + TEST_CYCLE() cv::gpu::lshift(d_src, val,d_dst); GPU_SANITY_CHECK(d_dst); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -1279,12 +995,7 @@ PERF_TEST_P(Sz_Depth, Core_MinMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::min(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::min(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::min(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1292,12 +1003,7 @@ PERF_TEST_P(Sz_Depth, Core_MinMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, { cv::Mat dst; - cv::min(src1, src2, dst); - - TEST_CYCLE() - { - cv::min(src1, src2, dst); - } + TEST_CYCLE() cv::min(src1, src2,dst); CPU_SANITY_CHECK(dst); } @@ -1321,12 +1027,7 @@ PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::min(d_src, val, d_dst); - - TEST_CYCLE() - { - cv::gpu::min(d_src, val, d_dst); - } + TEST_CYCLE() cv::gpu::min(d_src, val,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1334,12 +1035,7 @@ PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 { cv::Mat dst; - cv::min(src, val, dst); - - TEST_CYCLE() - { - cv::min(src, val, dst); - } + TEST_CYCLE() cv::min(src, val,dst); CPU_SANITY_CHECK(dst); } @@ -1365,12 +1061,7 @@ PERF_TEST_P(Sz_Depth, Core_MaxMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::max(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::max(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::max(d_src1, d_src2,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1378,12 +1069,7 @@ PERF_TEST_P(Sz_Depth, Core_MaxMat, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, { cv::Mat dst; - cv::max(src1, src2, dst); - - TEST_CYCLE() - { - cv::max(src1, src2, dst); - } + TEST_CYCLE() cv::max(src1, src2,dst); CPU_SANITY_CHECK(dst); } @@ -1407,12 +1093,7 @@ PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::max(d_src, val, d_dst); - - TEST_CYCLE() - { - cv::gpu::max(d_src, val, d_dst); - } + TEST_CYCLE() cv::gpu::max(d_src, val,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1420,12 +1101,7 @@ PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8 { cv::Mat dst; - cv::max(src, val, dst); - - TEST_CYCLE() - { - cv::max(src, val, dst); - } + TEST_CYCLE() cv::max(src, val,dst); CPU_SANITY_CHECK(dst); } @@ -1459,12 +1135,7 @@ PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine( cv::gpu::GpuMat d_src2(src2); 
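// Editorial note (not part of the patch): FAIL_NO_CPU() is the new helper used
// in the GPU-only tests above (abs, sqr, lshift/rshift, magnitudeSqr, ...); its
// definition is not shown in this diff, but judging from the statement it
// replaces it presumably expands to roughly
//   #define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
// so a perf test run in CPU mode still fails explicitly rather than passing silently.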
cv::gpu::GpuMat d_dst; - cv::gpu::addWeighted(d_src1, 0.5, d_src2, 0.5, 10.0, d_dst, dst_depth); - - TEST_CYCLE() - { - cv::gpu::addWeighted(d_src1, 0.5, d_src2, 0.5, 10.0, d_dst, dst_depth); - } + TEST_CYCLE() cv::gpu::addWeighted(d_src1, 0.5, d_src2, 0.5, 10.0, d_dst, dst_depth); GPU_SANITY_CHECK(d_dst); } @@ -1472,12 +1143,7 @@ PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine( { cv::Mat dst; - cv::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); - - TEST_CYCLE() - { - cv::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); - } + TEST_CYCLE() cv::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); CPU_SANITY_CHECK(dst); } @@ -1518,29 +1184,19 @@ PERF_TEST_P(Sz_Type_Flags, Core_GEMM, Combine( cv::gpu::GpuMat d_src3(src3); cv::gpu::GpuMat d_dst; - cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst, flags); + TEST_CYCLE() cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst, flags); - TEST_CYCLE() - { - cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst, flags); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); - declare.time(50.0); - TEST_CYCLE() - { - cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); - } + TEST_CYCLE() cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -1562,12 +1218,7 @@ PERF_TEST_P(Sz_Type, Core_Transpose, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::transpose(d_src, d_dst); - - TEST_CYCLE() - { - cv::gpu::transpose(d_src, d_dst); - } + TEST_CYCLE() cv::gpu::transpose(d_src,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1575,12 +1226,7 @@ PERF_TEST_P(Sz_Type, Core_Transpose, Combine( { cv::Mat dst; - cv::transpose(src, dst); - - TEST_CYCLE() - { - cv::transpose(src, dst); - } + TEST_CYCLE() cv::transpose(src,dst); CPU_SANITY_CHECK(dst); } @@ -1616,12 +1262,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code, Core_Flip, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::flip(d_src, d_dst, flipCode); - - TEST_CYCLE() - { - cv::gpu::flip(d_src, d_dst, flipCode); - } + TEST_CYCLE() cv::gpu::flip(d_src, d_dst, flipCode); GPU_SANITY_CHECK(d_dst); } @@ -1629,12 +1270,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code, Core_Flip, Combine( { cv::Mat dst; - cv::flip(src, dst, flipCode); - - TEST_CYCLE() - { - cv::flip(src, dst, flipCode); - } + TEST_CYCLE() cv::flip(src, dst, flipCode); CPU_SANITY_CHECK(dst); } @@ -1661,12 +1297,7 @@ PERF_TEST_P(Sz_Type, Core_LutOneChannel, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::LUT(d_src, lut, d_dst); - - TEST_CYCLE() - { - cv::gpu::LUT(d_src, lut, d_dst); - } + TEST_CYCLE() cv::gpu::LUT(d_src, lut,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1674,12 +1305,7 @@ PERF_TEST_P(Sz_Type, Core_LutOneChannel, Combine( { cv::Mat dst; - cv::LUT(src, lut, dst); - - TEST_CYCLE() - { - cv::LUT(src, lut, dst); - } + TEST_CYCLE() cv::LUT(src, lut, dst); CPU_SANITY_CHECK(dst); } @@ -1706,12 +1332,7 @@ PERF_TEST_P(Sz_Type, Core_LutMultiChannel, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::LUT(d_src, lut, d_dst); - - TEST_CYCLE() - { - cv::gpu::LUT(d_src, lut, d_dst); - } + TEST_CYCLE() cv::gpu::LUT(d_src, lut,d_dst); GPU_SANITY_CHECK(d_dst); } @@ -1719,12 +1340,7 @@ PERF_TEST_P(Sz_Type, Core_LutMultiChannel, Combine( { cv::Mat dst; - cv::LUT(src, lut, dst); - - TEST_CYCLE() - { - cv::LUT(src, lut, dst); - } + TEST_CYCLE() cv::LUT(src, lut, dst); CPU_SANITY_CHECK(dst); } @@ -1745,14 +1361,9 @@ PERF_TEST_P(Sz, Core_MagnitudeComplex, 
GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::magnitude(d_src, d_dst); + TEST_CYCLE() cv::gpu::magnitude(d_src,d_dst); - TEST_CYCLE() - { - cv::gpu::magnitude(d_src, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { @@ -1761,14 +1372,9 @@ PERF_TEST_P(Sz, Core_MagnitudeComplex, GPU_TYPICAL_MAT_SIZES) cv::Mat dst; - cv::magnitude(xy[0], xy[1], dst); + TEST_CYCLE() cv::magnitude(xy[0], xy[1], dst); - TEST_CYCLE() - { - cv::magnitude(xy[0], xy[1], dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -1787,18 +1393,13 @@ PERF_TEST_P(Sz, Core_MagnitudeSqrComplex, GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::magnitudeSqr(d_src, d_dst); - - TEST_CYCLE() - { - cv::gpu::magnitudeSqr(d_src, d_dst); - } + TEST_CYCLE() cv::gpu::magnitudeSqr(d_src, d_dst); GPU_SANITY_CHECK(d_dst); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -1821,27 +1422,17 @@ PERF_TEST_P(Sz, Core_Magnitude, GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::magnitude(d_src1, d_src2, d_dst); + TEST_CYCLE() cv::gpu::magnitude(d_src1, d_src2, d_dst); - TEST_CYCLE() - { - cv::gpu::magnitude(d_src1, d_src2, d_dst); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::magnitude(src1, src2, dst); + TEST_CYCLE() cv::magnitude(src1, src2, dst); - TEST_CYCLE() - { - cv::magnitude(src1, src2, dst); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -1865,18 +1456,13 @@ PERF_TEST_P(Sz, Core_MagnitudeSqr, GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::magnitudeSqr(d_src1, d_src2, d_dst); - - TEST_CYCLE() - { - cv::gpu::magnitudeSqr(d_src1, d_src2, d_dst); - } + TEST_CYCLE() cv::gpu::magnitudeSqr(d_src1, d_src2, d_dst); GPU_SANITY_CHECK(d_dst); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -1902,27 +1488,17 @@ PERF_TEST_P(Sz_AngleInDegrees, Core_Phase, Combine(GPU_TYPICAL_MAT_SIZES, Bool() cv::gpu::GpuMat d_src2(src2); cv::gpu::GpuMat d_dst; - cv::gpu::phase(d_src1, d_src2, d_dst, angleInDegrees); + TEST_CYCLE() cv::gpu::phase(d_src1, d_src2, d_dst, angleInDegrees); - TEST_CYCLE() - { - cv::gpu::phase(d_src1, d_src2, d_dst, angleInDegrees); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1e-8); } else { cv::Mat dst; - cv::phase(src1, src2, dst, angleInDegrees); + TEST_CYCLE() cv::phase(src1, src2, dst, angleInDegrees); - TEST_CYCLE() - { - cv::phase(src1, src2, dst, angleInDegrees); - } - - CPU_SANITY_CHECK(dst); + CPU_SANITY_CHECK(dst, 1e-8); } } @@ -1947,15 +1523,10 @@ PERF_TEST_P(Sz_AngleInDegrees, Core_CartToPolar, Combine(GPU_TYPICAL_MAT_SIZES, cv::gpu::GpuMat d_magnitude; cv::gpu::GpuMat d_angle; - cv::gpu::cartToPolar(d_src1, d_src2, d_magnitude, d_angle, angleInDegrees); + TEST_CYCLE() cv::gpu::cartToPolar(d_src1, d_src2, d_magnitude, d_angle, angleInDegrees); - TEST_CYCLE() - { - cv::gpu::cartToPolar(d_src1, d_src2, d_magnitude, d_angle, angleInDegrees); - } - - GPU_SANITY_CHECK(d_magnitude); - GPU_SANITY_CHECK(d_angle); + GPU_SANITY_CHECK(d_magnitude, 1e-8); + GPU_SANITY_CHECK(d_angle, 1e-8); } else @@ -1963,15 +1534,10 @@ PERF_TEST_P(Sz_AngleInDegrees, Core_CartToPolar, Combine(GPU_TYPICAL_MAT_SIZES, cv::Mat magnitude; cv::Mat angle; - cv::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); + TEST_CYCLE() cv::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); - TEST_CYCLE() 
- { - cv::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); - } - - CPU_SANITY_CHECK(magnitude); - CPU_SANITY_CHECK(angle); + CPU_SANITY_CHECK(magnitude, 1e-8); + CPU_SANITY_CHECK(angle, 1e-8); } } @@ -1996,30 +1562,20 @@ PERF_TEST_P(Sz_AngleInDegrees, Core_PolarToCart, Combine(GPU_TYPICAL_MAT_SIZES, cv::gpu::GpuMat d_x; cv::gpu::GpuMat d_y; - cv::gpu::polarToCart(d_magnitude, d_angle, d_x, d_y, angleInDegrees); + TEST_CYCLE() cv::gpu::polarToCart(d_magnitude, d_angle, d_x, d_y, angleInDegrees); - TEST_CYCLE() - { - cv::gpu::polarToCart(d_magnitude, d_angle, d_x, d_y, angleInDegrees); - } - - GPU_SANITY_CHECK(d_x); - GPU_SANITY_CHECK(d_y); + GPU_SANITY_CHECK(d_x, 1e-8); + GPU_SANITY_CHECK(d_y, 1e-8); } else { cv::Mat x; cv::Mat y; - cv::polarToCart(magnitude, angle, x, y, angleInDegrees); + TEST_CYCLE() cv::polarToCart(magnitude, angle, x, y, angleInDegrees); - TEST_CYCLE() - { - cv::polarToCart(magnitude, angle, x, y, angleInDegrees); - } - - CPU_SANITY_CHECK(x); - CPU_SANITY_CHECK(y); + CPU_SANITY_CHECK(x, 1e-8); + CPU_SANITY_CHECK(y, 1e-8); } } @@ -2041,24 +1597,14 @@ PERF_TEST_P(Sz, Core_MeanStdDev, GPU_TYPICAL_MAT_SIZES) cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - cv::gpu::meanStdDev(d_src, mean, stddev, d_buf); - - TEST_CYCLE() - { - cv::gpu::meanStdDev(d_src, mean, stddev, d_buf); - } + TEST_CYCLE() cv::gpu::meanStdDev(d_src, mean, stddev, d_buf); } else { - cv::meanStdDev(src, mean, stddev); - - TEST_CYCLE() - { - cv::meanStdDev(src, mean, stddev); - } + TEST_CYCLE() cv::meanStdDev(src, mean, stddev); } - GPU_SANITY_CHECK(stddev); + GPU_SANITY_CHECK(stddev, 1e-6); } ////////////////////////////////////////////////////////////////////// @@ -2085,24 +1631,14 @@ PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::norm(d_src, normType, d_buf); - - TEST_CYCLE() - { - dst = cv::gpu::norm(d_src, normType, d_buf); - } + TEST_CYCLE() dst = cv::gpu::norm(d_src, normType, d_buf); } else { - dst = cv::norm(src, normType); - - TEST_CYCLE() - { - dst = cv::norm(src, normType); - } + TEST_CYCLE() dst = cv::norm(src, normType); } - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } ////////////////////////////////////////////////////////////////////// @@ -2130,25 +1666,15 @@ PERF_TEST_P(Sz_Norm, Core_NormDiff, Combine( cv::gpu::GpuMat d_src1(src1); cv::gpu::GpuMat d_src2(src2); - dst = cv::gpu::norm(d_src1, d_src2, normType); - - TEST_CYCLE() - { - dst = cv::gpu::norm(d_src1, d_src2, normType); - } + TEST_CYCLE() dst = cv::gpu::norm(d_src1, d_src2, normType); } else { - dst = cv::norm(src1, src2, normType); - - TEST_CYCLE() - { - dst = cv::norm(src1, src2, normType); - } + TEST_CYCLE() dst = cv::norm(src1, src2, normType); } - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } ////////////////////////////////////////////////////////////////////// @@ -2175,24 +1701,14 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::sum(d_src, d_buf); - - TEST_CYCLE() - { - dst = cv::gpu::sum(d_src, d_buf); - } + TEST_CYCLE() dst = cv::gpu::sum(d_src, d_buf); } else { - dst = cv::sum(src); - - TEST_CYCLE() - { - dst = cv::sum(src); - } + TEST_CYCLE() dst = cv::sum(src); } - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } ////////////////////////////////////////////////////////////////////// @@ -2219,18 +1735,13 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::absSum(d_src, d_buf); + TEST_CYCLE() dst = 
cv::gpu::absSum(d_src, d_buf); - TEST_CYCLE() - { - dst = cv::gpu::absSum(d_src, d_buf); - } - - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -2258,18 +1769,13 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::sqrSum(d_src, d_buf); + TEST_CYCLE() dst = cv::gpu::sqrSum(d_src, d_buf); - TEST_CYCLE() - { - dst = cv::gpu::sqrSum(d_src, d_buf); - } - - SANITY_CHECK(dst); + SANITY_CHECK(dst, 1e-6); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -2293,19 +1799,14 @@ PERF_TEST_P(Sz_Depth, Core_MinMax, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - cv::gpu::minMax(d_src, &minVal, &maxVal, cv::gpu::GpuMat(), d_buf); - - TEST_CYCLE() - { - cv::gpu::minMax(d_src, &minVal, &maxVal, cv::gpu::GpuMat(), d_buf); - } + TEST_CYCLE() cv::gpu::minMax(d_src, &minVal, &maxVal, cv::gpu::GpuMat(), d_buf); SANITY_CHECK(minVal); SANITY_CHECK(maxVal); } else { - FAIL() << "No such CPU implementation analogy"; + FAIL_NO_CPU(); } } @@ -2330,25 +1831,15 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_valbuf, d_locbuf; - cv::gpu::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), d_valbuf, d_locbuf); - - TEST_CYCLE() - { - cv::gpu::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), d_valbuf, d_locbuf); - } + TEST_CYCLE() cv::gpu::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), d_valbuf, d_locbuf); } else { - cv::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc); - - TEST_CYCLE() - { - cv::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc); - } + TEST_CYCLE() cv::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc); } - SANITY_CHECK(minVal); - SANITY_CHECK(maxVal); + SANITY_CHECK(minVal, 1e-12); + SANITY_CHECK(maxVal, 1e-12); // unsupported by peft system //SANITY_CHECK(minLoc); @@ -2368,28 +1859,18 @@ PERF_TEST_P(Sz_Depth, Core_CountNonZero, Combine( cv::Mat src(size, depth); fillRandom(src); - int dst; + int dst = 0; if (PERF_RUN_GPU()) { cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - dst = cv::gpu::countNonZero(d_src, d_buf); - - TEST_CYCLE() - { - dst = cv::gpu::countNonZero(d_src, d_buf); - } + TEST_CYCLE() dst = cv::gpu::countNonZero(d_src, d_buf); } else { - dst = cv::countNonZero(src); - - TEST_CYCLE() - { - dst = cv::countNonZero(src); - } + TEST_CYCLE() dst = cv::countNonZero(src); } SANITY_CHECK(dst); @@ -2430,25 +1911,17 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_dst; - cv::gpu::reduce(d_src, d_dst, dim, reduceOp); + TEST_CYCLE() cv::gpu::reduce(d_src, d_dst, dim, reduceOp); - TEST_CYCLE() - { - cv::gpu::reduce(d_src, d_dst, dim, reduceOp); - } - - GPU_SANITY_CHECK(d_dst); + GPU_SANITY_CHECK(d_dst, 1); } else { cv::Mat dst; - cv::reduce(src, dst, dim, reduceOp); + TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp); - TEST_CYCLE() - { - cv::reduce(src, dst, dim, reduceOp); - } + CPU_SANITY_CHECK(dst, 1); } } diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 30377e148f..ee0968442c 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES) { cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_hist; - cv::gpu::GpuMat d_buf; - cv::gpu::calcHist(d_src, d_hist, d_buf); + cv::gpu::calcHist(d_src, d_hist); 
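
The perf_core.cpp hunks above all apply the same mechanical simplification: the separate warm-up call followed by a braced TEST_CYCLE() block collapses into a single TEST_CYCLE() statement, and the sanity-check macros gain an explicit tolerance where exact equality is too strict. A minimal sketch of the resulting test shape, modeled on the Core_CountNonZero hunk and using only macros already present in these files (PERF_TEST_P, TEST_CYCLE, PERF_RUN_GPU, SANITY_CHECK, fillRandom); the test name here is illustrative only:

PERF_TEST_P(Sz, Core_ExampleCountNonZero, GPU_TYPICAL_MAT_SIZES)
{
    const cv::Size size = GetParam();

    cv::Mat src(size, CV_8UC1);
    fillRandom(src);

    int dst = 0;

    if (PERF_RUN_GPU())
    {
        cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat d_buf;

        // one-line form: TEST_CYCLE() now wraps the measured call directly,
        // replacing the old warm-up call plus TEST_CYCLE() { ... } block
        TEST_CYCLE() dst = cv::gpu::countNonZero(d_src, d_buf);
    }
    else
    {
        TEST_CYCLE() dst = cv::countNonZero(src);
    }

    // other hunks pass a tolerance here, e.g. SANITY_CHECK(dst, 1e-6)
    SANITY_CHECK(dst);
}
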
TEST_CYCLE() { - cv::gpu::calcHist(d_src, d_hist, d_buf); + cv::gpu::calcHist(d_src, d_hist); } GPU_SANITY_CHECK(d_hist); @@ -1706,10 +1705,40 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidGetLayer, Combine(GPU_TYPICAL_MAT_S } } +namespace { + struct Vec4iComparator + { + bool operator()(const cv::Vec4i& a, const cv::Vec4i b) const + { + if (a[0] != b[0]) return a[0] < b[0]; + else if(a[1] != b[1]) return a[1] < b[1]; + else if(a[2] != b[2]) return a[2] < b[2]; + else return a[3] < b[3]; + } + }; + struct Vec3fComparator + { + bool operator()(const cv::Vec3f& a, const cv::Vec3f b) const + { + if(a[0] != b[0]) return a[0] < b[0]; + else if(a[1] != b[1]) return a[1] < b[1]; + else return a[2] < b[2]; + } + }; + struct Vec2fComparator + { + bool operator()(const cv::Vec2f& a, const cv::Vec2f b) const + { + if(a[0] != b[0]) return a[0] < b[0]; + else return a[1] < b[1]; + } + }; +} + ////////////////////////////////////////////////////////////////////// // HoughLines -PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) +PERF_TEST_P(Sz, ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) { declare.time(30.0); @@ -1744,7 +1773,11 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold); } - GPU_SANITY_CHECK(d_lines); + cv::Mat h_lines(d_lines); + cv::Vec2f* begin = (cv::Vec2f*)(h_lines.ptr(0)); + cv::Vec2f* end = (cv::Vec2f*)(h_lines.ptr(0) + (h_lines.cols) * 2 * sizeof(float)); + std::sort(begin, end, Vec2fComparator()); + SANITY_CHECK(h_lines); } else { @@ -1756,7 +1789,64 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) cv::HoughLines(src, lines, rho, theta, threshold); } - CPU_SANITY_CHECK(lines); + std::sort(lines.begin(), lines.end(), Vec2fComparator()); + SANITY_CHECK(lines); + } +} + +////////////////////////////////////////////////////////////////////// +// HoughLinesP + +DEF_PARAM_TEST_1(Image, std::string); + +PERF_TEST_P(Image, ImgProc_HoughLinesP, testing::Values("cv/shared/pic5.png", "stitching/a1.png")) +{ + declare.time(30.0); + + std::string fileName = getDataPath(GetParam()); + + const double rho = 1.0f; + const double theta = CV_PI / 180.0; + const int threshold = 100; + const int minLineLenght = 50; + const int maxLineGap = 5; + + cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE); + + cv::Mat mask; + cv::Canny(image, mask, 50, 100); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_mask(mask); + cv::gpu::GpuMat d_lines; + cv::gpu::HoughLinesBuf d_buf; + + cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + } + + cv::Mat h_lines(d_lines); + cv::Vec4i* begin = h_lines.ptr(); + cv::Vec4i* end = h_lines.ptr() + h_lines.cols; + std::sort(begin, end, Vec4iComparator()); + SANITY_CHECK(h_lines); + } + else + { + std::vector lines; + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + } + + std::sort(lines.begin(), lines.end(), Vec4iComparator()); + SANITY_CHECK(lines); } } @@ -1804,7 +1894,11 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES, cv::gpu::HoughCircles(d_src, d_circles, d_buf, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius); } - GPU_SANITY_CHECK(d_circles); + cv::Mat h_circles(d_circles); + cv::Vec3f* begin = 
(cv::Vec3f*)(h_circles.ptr(0)); + cv::Vec3f* end = (cv::Vec3f*)(h_circles.ptr(0) + (h_circles.cols) * 3 * sizeof(float)); + std::sort(begin, end, Vec3fComparator()); + SANITY_CHECK(h_circles); } else { @@ -1817,7 +1911,8 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES, cv::HoughCircles(src, circles, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius); } - CPU_SANITY_CHECK(circles); + std::sort(circles.begin(), circles.end(), Vec3fComparator()); + SANITY_CHECK(circles); } } diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 6b864a3e52..6d040ac02f 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values("gpu/caltech/image_00000009_0.png", "gp SANITY_CHECK(found_locations); } - /////////////////////////////////////////////////////////////// // HaarClassifier @@ -181,4 +180,4 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier, } } -} // namespace +} // namespace \ No newline at end of file diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp new file mode 100644 index 0000000000..e9437d70f9 --- /dev/null +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -0,0 +1,279 @@ +#include "perf_precomp.hpp" + +#define GPU_PERF_TEST_P(fixture, name, params) \ + class fixture##_##name : public fixture {\ + public:\ + fixture##_##name() {}\ + protected:\ + virtual void __cpu();\ + virtual void __gpu();\ + virtual void PerfTestBody();\ + };\ + TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); }\ + INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\ + void fixture##_##name::PerfTestBody() { if (PERF_RUN_GPU()) __gpu(); else __cpu(); } + +#define RUN_CPU(fixture, name)\ + void fixture##_##name::__cpu() + +#define RUN_GPU(fixture, name)\ + void fixture##_##name::__gpu() + +#define NO_CPU(fixture, name)\ +void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";} + +namespace { + struct DetectionLess + { + bool operator()(const cv::gpu::SCascade::Detection& a, + const cv::gpu::SCascade::Detection& b) const + { + if (a.x != b.x) return a.x < b.x; + else if (a.y != b.y) return a.y < b.y; + else if (a.w != b.w) return a.w < b.w; + else return a.h < b.h; + } + }; + + cv::Mat sortDetections(cv::gpu::GpuMat& objects) + { + cv::Mat detections(objects); + + typedef cv::gpu::SCascade::Detection Detection; + Detection* begin = (Detection*)(detections.ptr(0)); + Detection* end = (Detection*)(detections.ptr(0) + detections.cols); + std::sort(begin, end, DetectionLess()); + + return detections; + } +} + + +typedef std::tr1::tuple fixture_t; +typedef perf::TestBaseWithParam SCascadeTest; + +GPU_PERF_TEST_P(SCascadeTest, detect, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) + +RUN_GPU(SCascadeTest, detect) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(1); + + cascade.detect(colored, rois, objectBoxes); + + TEST_CYCLE() + { + 
cascade.detect(colored, rois, objectBoxes); + } + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTest, detect) + +static cv::Rect getFromTable(int idx) +{ + static const cv::Rect rois[] = + { + cv::Rect( 65 * 4, 20 * 4, 35 * 4, 80 * 4), + cv::Rect( 95 * 4, 35 * 4, 45 * 4, 40 * 4), + cv::Rect( 45 * 4, 35 * 4, 45 * 4, 40 * 4), + cv::Rect( 25 * 4, 27 * 4, 50 * 4, 45 * 4), + cv::Rect(100 * 4, 50 * 4, 45 * 4, 40 * 4), + + cv::Rect( 60 * 4, 30 * 4, 45 * 4, 40 * 4), + cv::Rect( 40 * 4, 55 * 4, 50 * 4, 40 * 4), + cv::Rect( 48 * 4, 37 * 4, 72 * 4, 80 * 4), + cv::Rect( 48 * 4, 32 * 4, 85 * 4, 58 * 4), + cv::Rect( 48 * 4, 0 * 4, 32 * 4, 27 * 4) + }; + + return rois[idx]; +} + +typedef std::tr1::tuple roi_fixture_t; +typedef perf::TestBaseWithParam SCascadeTestRoi; + +GPU_PERF_TEST_P(SCascadeTestRoi, detectInRoi, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 5))) + +RUN_GPU(SCascadeTestRoi, detectInRoi) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(0); + + int nroi = GET_PARAM(2); + cv::RNG rng; + for (int i = 0; i < nroi; ++i) + { + cv::Rect r = getFromTable(rng(10)); + cv::gpu::GpuMat sub(rois, r); + sub.setTo(1); + } + + cascade.detect(colored, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detect(colored, rois, objectBoxes); + } + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTestRoi, detectInRoi) + + +GPU_PERF_TEST_P(SCascadeTestRoi, detectEachRoi, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 10))) + +RUN_GPU(SCascadeTestRoi, detectEachRoi) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(0); + + int idx = GET_PARAM(2); + cv::Rect r = getFromTable(idx); + cv::gpu::GpuMat sub(rois, r); + sub.setTo(1); + + cascade.detect(colored, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detect(colored, rois, objectBoxes); + } + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTestRoi, detectEachRoi) + +GPU_PERF_TEST_P(SCascadeTest, detectOnIntegral, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/integrals.xml")))) + +static std::string itoa(long i) +{ + static char s[65]; + sprintf(s, "%ld", i); + return std::string(s); +} + +RUN_GPU(SCascadeTest, detectOnIntegral) +{ + cv::FileStorage fsi(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); + ASSERT_TRUE(fsi.isOpened()); + + cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1); + for (int i = 0; i < 10; ++i) + { + cv::Mat channel; + fsi[std::string("channel") + 
itoa(i)] >> channel; + cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121)); + gchannel.upload(channel); + } + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(cv::Size(640, 480), CV_8UC1); + rois.setTo(1); + + cascade.detect(hogluv, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detect(hogluv, rois, objectBoxes); + } + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTest, detectOnIntegral) + +GPU_PERF_TEST_P(SCascadeTest, detectStream, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) + +RUN_GPU(SCascadeTest, detectStream) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); + rois.setTo(1); + + cv::gpu::Stream s; + + cascade.detect(colored, rois, objectBoxes, s); + + TEST_CYCLE() + { + cascade.detect(colored, rois, objectBoxes, s); + } + +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); +#endif + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTest, detectStream) diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp index b18cb17dfb..bf2fd99c6e 100644 --- a/modules/gpu/perf/perf_video.cpp +++ b/modules/gpu/perf/perf_video.cpp @@ -394,6 +394,173 @@ PERF_TEST_P(ImagePair, Video_FarnebackOpticalFlow, } } +////////////////////////////////////////////////////// +// OpticalFlowDual_TVL1 + +PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1, + Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png"))) +{ + declare.time(20); + + cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame0.empty()); + + cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame1.empty()); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_frame0(frame0); + cv::gpu::GpuMat d_frame1(frame1); + cv::gpu::GpuMat d_flowx; + cv::gpu::GpuMat d_flowy; + + cv::gpu::OpticalFlowDual_TVL1_GPU d_alg; + + d_alg(d_frame0, d_frame1, d_flowx, d_flowy); + + TEST_CYCLE() + { + d_alg(d_frame0, d_frame1, d_flowx, d_flowy); + } + + GPU_SANITY_CHECK(d_flowx); + GPU_SANITY_CHECK(d_flowy); + } + else + { + cv::Mat flow; + + cv::OpticalFlowDual_TVL1 alg; + + alg(frame0, frame1, flow); + + TEST_CYCLE() + { + alg(frame0, frame1, flow); + } + + CPU_SANITY_CHECK(flow); + } +} + +////////////////////////////////////////////////////// +// OpticalFlowBM + +void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr, + cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious, + cv::Mat& velx, cv::Mat& vely) +{ + cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height); + + velx.create(sz, CV_32FC1); + vely.create(sz, CV_32FC1); + + CvMat cvprev = prev; + CvMat cvcurr = curr; + + CvMat cvvelx = velx; + CvMat cvvely = vely; + + 
cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely); +} + +PERF_TEST_P(ImagePair, Video_OpticalFlowBM, + Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png"))) +{ + declare.time(400); + + cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame0.empty()); + + cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame1.empty()); + + cv::Size block_size(16, 16); + cv::Size shift_size(1, 1); + cv::Size max_range(16, 16); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_frame0(frame0); + cv::gpu::GpuMat d_frame1(frame1); + cv::gpu::GpuMat d_velx, d_vely, buf; + + cv::gpu::calcOpticalFlowBM(d_frame0, d_frame1, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + + TEST_CYCLE() + { + cv::gpu::calcOpticalFlowBM(d_frame0, d_frame1, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + } + + GPU_SANITY_CHECK(d_velx); + GPU_SANITY_CHECK(d_vely); + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE() + { + calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); + } + + CPU_SANITY_CHECK(velx); + CPU_SANITY_CHECK(vely); + } +} + +PERF_TEST_P(ImagePair, Video_FastOpticalFlowBM, + Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png"))) +{ + declare.time(400); + + cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame0.empty()); + + cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame1.empty()); + + cv::Size block_size(16, 16); + cv::Size shift_size(1, 1); + cv::Size max_range(16, 16); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_frame0(frame0); + cv::gpu::GpuMat d_frame1(frame1); + cv::gpu::GpuMat d_velx, d_vely; + + cv::gpu::FastOpticalFlowBM fastBM; + + fastBM(d_frame0, d_frame1, d_velx, d_vely, max_range.width, block_size.width); + + TEST_CYCLE() + { + fastBM(d_frame0, d_frame1, d_velx, d_vely, max_range.width, block_size.width); + } + + GPU_SANITY_CHECK(d_velx); + GPU_SANITY_CHECK(d_vely); + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE() + { + calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); + } + + CPU_SANITY_CHECK(velx); + CPU_SANITY_CHECK(vely); + } +} + ////////////////////////////////////////////////////// // FGDStatModel diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 1a10bc32eb..242febded9 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream) { #ifndef HAVE_CUBLAS - (void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream; + (void)src1; + (void)src2; + (void)alpha; + (void)src3; + (void)beta; + (void)dst; + (void)flags; + (void)stream; CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS"); - #else - // CUBLAS works with column-major matrices CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2); @@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double 
alpha, const G if (src1.depth() == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } @@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G } cublasSafeCall( cublasDestroy_v2(handle) ); - #endif } @@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) } else // if (src.elemSize() == 8) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); NppStStreamHandler h(stream); diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index a04639715d..095a64adb4 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); } namespace bf_knnmatch { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const 
PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); } namespace bf_radius_match { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); } }}} @@ -202,7 +202,7 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -238,10 +238,7 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai caller_t func = callers[query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector& matches) @@ -348,7 +345,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat& typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -383,10 +380,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat& caller_t func = callers[query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, 
trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector& matches) @@ -462,7 +456,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -512,10 +506,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t caller_t func = callers[query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); + func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, @@ -594,7 +585,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -634,10 +625,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM caller_t func = callers[query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, @@ -778,7 +766,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -799,12 +787,6 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat matchHamming_gpu, 0/*matchHamming_gpu*/ }; - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) - CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); - const int nQuery = query.rows; const int nTrain = train.rows; @@ -830,7 +812,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat caller_t func = callers[query.depth()]; CV_Assert(func != 0); - func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); + func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& 
nMatches, @@ -913,7 +895,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callersL1[] = { @@ -934,12 +916,6 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& matchHamming_gpu, 0/*matchHamming_gpu*/ }; - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) - CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); - const int nQuery = query.rows; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); @@ -968,7 +944,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& vector masks_(masks.begin(), masks.end()); func(query, &trains_[0], static_cast(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], - trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); + trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream)); } void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 07e174e5cf..3603933979 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -623,7 +623,7 @@ private: } // copy data structures on gpu - stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) )); + stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) )); trees_mat.upload(cv::Mat(cl_trees).reshape(1,1)); nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1)); leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1)); diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index 6a778735b8..49bc1dfcd2 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -42,10 +42,13 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" +#include "opencv2/gpu/device/warp_shuffle.hpp" namespace cv { namespace gpu { namespace device { @@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device int& bestTrainIdx1, int& bestTrainIdx2, float* s_distance, int* s_trainIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + + float d1, d2; + int i1, i2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + 
bestTrainIdx2 = i2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx2 = myBestTrainIdx2; + #endif } template @@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device int& bestImgIdx1, int& bestImgIdx2, float* s_distance, int* s_trainIdx, int* s_imgIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + (void) s_imgIdx; + + float d1, d2; + int i1, i2; + int j1, j2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE); + j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + bestImgIdx2 = j1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + bestImgIdx2 = bestImgIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + bestImgIdx1 = j1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + bestTrainIdx2 = i2; + bestImgIdx2 = j2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device bestImgIdx1 = myBestImgIdx1; bestImgIdx2 = myBestImgIdx2; + #endif } /////////////////////////////////////////////////////////////////////////////// @@ -748,9 +839,8 @@ namespace cv { namespace gpu { namespace device template void match2Dispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz >(trainIdx), static_cast< PtrStepSz > (distance), stream); @@ -780,9 +870,8 @@ namespace cv { namespace gpu { namespace device template void match2Dispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz >(trainIdx), static_cast< PtrStepSz >(imgIdx), static_cast< PtrStepSz > (distance), stream); @@ -945,9 +1034,8 @@ namespace cv { namespace gpu { namespace device template void calcDistanceDispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); @@ -1005,7 +1093,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx[threadIdx.x] = bestIdx; __syncthreads(); - reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); + reduceKeyVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); if (threadIdx.x == 0) { @@ -1034,7 +1122,7 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } - void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, 
const PtrStepSzf& allDist, int cc, cudaStream_t stream) + void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream) { findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); } @@ -1045,16 +1133,16 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, int k, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (k == 2) { - match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); + match2Dispatcher(query, train, mask, trainIdx, distance, stream); } else { - calcDistanceDispatcher(query, train, mask, allDist, cc, stream); - findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); + calcDistanceDispatcher(query, train, mask, allDist, stream); + findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); } } @@ -1063,105 +1151,105 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream); else - matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& 
distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream); else - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const 
PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream); else - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& 
queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream); } - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const 
PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream); } - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, 
const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream); } - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, 
cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); } // namespace bf_knnmatch }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index f50089ed94..5e64e31bd9 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -42,7 +42,9 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device s_distance += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - - __syncthreads(); - - reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); } template @@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - s_imgIdx [threadIdx.x] = bestImgIdx; - - __syncthreads(); - - reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less()); } /////////////////////////////////////////////////////////////////////////////// @@ -567,9 +558,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); @@ -599,9 +589,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); @@ -633,153 +622,153 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, 
const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, 
cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void 
matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void 
matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& 
distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); } // namespace bf_match }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 934b8fe84c..19ee94e331 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -42,7 +42,8 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device __global__ void matchUnrolled(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, 
unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device __global__ void match(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -281,9 +274,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, float maxDistance, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); @@ -313,9 +305,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); @@ -347,126 +338,126 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t 
stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void 
matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void 
matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void 
matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const 
PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const 
PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); } // namespace bf_radius_match }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 40c847547e..0fd482c41a 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -42,9 +42,10 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/reduce.hpp" namespace cv { namespace gpu { namespace device { @@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); } + __device__ __forceinline__ TransformOp() {} + __device__ __forceinline__ TransformOp(const TransformOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); } + __device__ __forceinline__ ProjectOp() {} + __device__ __forceinline__ ProjectOp(const ProjectOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device return x * x; } + template __global__ void computeHypothesisScoresKernel( const int num_points, const float3* object, const float2* image, const float dist_threshold, int* g_num_inliers) @@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device ++num_inliers; } - extern __shared__ float s_num_inliers[]; - s_num_inliers[threadIdx.x] = num_inliers; - __syncthreads(); - - for (int step = blockDim.x / 2; step > 0; step >>= 1) - { - if (threadIdx.x < step) - s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step]; - __syncthreads(); - } + __shared__ int s_num_inliers[BLOCK_SIZE]; + reduce(s_num_inliers, num_inliers, threadIdx.x, plus()); if (threadIdx.x == 0) - g_num_inliers[blockIdx.x] = s_num_inliers[0]; + g_num_inliers[blockIdx.x] = num_inliers; } void computeHypothesisScores( @@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device 
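For context on the two calib3d.cu hunks around this point: the old computeHypothesisScoresKernel summed per-thread inlier counts with a hand-rolled loop over dynamically sized shared memory, while the new version declares a fixed __shared__ int s_num_inliers[BLOCK_SIZE] and delegates to the reduce(...) helper from opencv2/gpu/device/reduce.hpp; that is why the kernel gains a compile-time BLOCK_SIZE template parameter (instantiated as 256 at the launch site just below) and why the separate smem_size launch argument disappears. The same idea drives the reduceKeyVal/smem_tuple changes in the matcher files earlier in this patch. What follows is only an illustrative sketch of the block-reduction pattern being encapsulated; it is not part of this patch and not the actual OpenCV helper, and the kernel name, parameters, and launch shown are assumptions made for the example.

    // Sketch: block-wide sum reduction over statically sized shared memory.
    // BLOCK_SIZE is a power of two and must equal blockDim.x.
    template <int BLOCK_SIZE>
    __global__ void countInliersSketch(const float* residuals, int num_points,
                                       float dist_threshold, int* block_sums)
    {
        __shared__ int smem[BLOCK_SIZE];

        // each thread accumulates a partial count over a strided range of points
        int num_inliers = 0;
        for (int i = threadIdx.x; i < num_points; i += BLOCK_SIZE)
            num_inliers += residuals[i] < dist_threshold;

        smem[threadIdx.x] = num_inliers;
        __syncthreads();

        // tree reduction in shared memory
        for (int step = BLOCK_SIZE / 2; step > 0; step >>= 1)
        {
            if (threadIdx.x < step)
                smem[threadIdx.x] += smem[threadIdx.x + step];
            __syncthreads();
        }

        // thread 0 holds the block-wide sum
        if (threadIdx.x == 0)
            block_sums[blockIdx.x] = smem[0];
    }

    // hypothetical usage: one 256-thread block per hypothesis, no dynamic shared memory argument
    //   countInliersSketch<256><<<num_hypotheses, 256>>>(residuals, num_points, dist_threshold, block_sums);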
dim3 threads(256); dim3 grid(num_hypotheses); - int smem_size = threads.x * sizeof(float); - computeHypothesisScoresKernel<<>>( + computeHypothesisScoresKernel<256><<>>( num_points, object, image, dist_threshold, hypothesis_scores); cudaSafeCall( cudaGetLastError() ); @@ -193,4 +190,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu index 3dc0486783..0a5daebaaf 100644 --- a/modules/gpu/src/cuda/canny.cu +++ b/modules/gpu/src/cuda/canny.cu @@ -43,459 +43,451 @@ #if !defined CUDA_DISABLER #include -#include -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/emulation.hpp" +#include "opencv2/gpu/device/transform.hpp" +#include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/utility.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace canny +{ + struct L1 : binary_function + { + __device__ __forceinline__ float operator ()(int x, int y) const + { + return ::abs(x) + ::abs(y); + } + + __device__ __forceinline__ L1() {} + __device__ __forceinline__ L1(const L1&) {} + }; + struct L2 : binary_function + { + __device__ __forceinline__ float operator ()(int x, int y) const + { + return ::sqrtf(x * x + y * y); + } + + __device__ __forceinline__ L2() {} + __device__ __forceinline__ L2(const L2&) {} + }; +} namespace cv { namespace gpu { namespace device { - namespace canny + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits { - __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; +}}} + +namespace canny +{ + texture tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); + struct SrcTex + { + const int xoff; + const int yoff; + __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} + + __device__ __forceinline__ int operator ()(int y, int x) const { - __shared__ int smem[16][18]; + return tex2D(tex_src, x + xoff, y + yoff); + } + }; - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + template __global__ + void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (i < rows) - { - smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; - if (threadIdx.x == 0) - { - smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)]; - smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)]; - } - __syncthreads(); + if (y >= mag.rows || x >= mag.cols) + return; - if (j < cols) - { - dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2]; - dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2]; - } - } + int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); + int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); + + dx(y, x) = dxVal; + dy(y, x) = dyVal; + + mag(y, x) = norm(dxVal, dyVal); + } + + 
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + const dim3 block(16, 16); + const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); + + bindTexture(&tex_src, srcWhole); + SrcTex src(xoff, yoff); + + if (L2Grad) + { + L2 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + } + else + { + L1 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); } - void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall(cudaThreadSynchronize()); + } + + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + if (L2Grad) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - calcSobelRowPass<<>>(src, dx_buf, dy_buf, rows, cols); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); + L2 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); } - - struct L1 + else { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::abs(x) + ::abs(y); - } - }; - struct L2 - { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::sqrtf(x * x + y * y); - } - }; - - template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, - PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) - { - __shared__ int sdx[18][16]; - __shared__ int sdy[18][16]; - - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; - - if (j < cols) - { - sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j]; - sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; - if (threadIdx.y == 0) - { - sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j]; - sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j]; - - sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j]; - sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j]; - } - __syncthreads(); - - if (i < rows) - { - int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x]; - int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x]; - - dx.ptr(i)[j] = x; - dy.ptr(i)[j] = y; - - mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); - } - } + L1 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); } + } +} - void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) +////////////////////////////////////////////////////////////////////////////////////////// + +namespace canny +{ + texture tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); + + __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) + { + const int CANNY_SHIFT = 15; + const int TG22 = (int)(0.4142135623730950488016887242097*(1<= dx.cols - 1 || y == 0 || y >= dx.rows - 1) + return; + + int dxVal = dx(y, x); + int dyVal = dy(y, x); + + const int s = (dxVal ^ dyVal) < 0 ? 
-1 : 1; + const float m = tex2D(tex_mag, x, y); + + dxVal = ::abs(dxVal); + dyVal = ::abs(dyVal); + + // 0 - the pixel can not belong to an edge + // 1 - the pixel might belong to an edge + // 2 - the pixel does belong to an edge + int edge_type = 0; + + if (m > low_thresh) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + const int tg22x = dxVal * TG22; + const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); - if (L2Grad) - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); + dyVal <<= CANNY_SHIFT; + + if (dyVal < tg22x) + { + if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) + edge_type = 1 + (int)(m > high_thresh); + } + else if(dyVal > tg67x) + { + if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } else - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); - - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall(cudaThreadSynchronize()); + { + if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } } - template __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) - { - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + map(y, x) = edge_type; + } - if (i < rows && j < cols) - mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]); + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) + { + const dim3 block(16, 16); + const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); + + bindTexture(&tex_mag, mag); + + calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// + +namespace canny +{ + __device__ int counter = 0; + + __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st) + { + __shared__ volatile int smem[18][18]; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0; + if (threadIdx.y == 0) + smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0; + if (threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0; + if (threadIdx.x == 0) + smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1) + smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == 0) + smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) + smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? 
map(y + 1, x + 1) : 0; + + __syncthreads(); + + if (x >= map.cols || y >= map.rows) + return; + + int n; + + #pragma unroll + for (int k = 0; k < 16; ++k) + { + n = 0; + + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } + + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; } - void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; + + map(y, x) = e; + + n = 0; + + if (e == 2) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; - if (L2Grad) - calcMagnitude<<>>(dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx, dy, mag, rows, cols); + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; } - ////////////////////////////////////////////////////////////////////////////////////////// - - #define CANNY_SHIFT 15 - #define TG22 (int)(0.4142135623730950488016887242097*(1< 0) { - __shared__ float smem[18][18]; + const int ind = ::atomicAdd(&counter, 1); + st[ind] = make_ushort2(x, y); + } + } - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); - if (ly < 14) - smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + const dim3 block(16, 16); + const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + edgesHysteresisLocalKernel<<>>(map, st1); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// + +namespace canny +{ + __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + + __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count) + { + const int stack_size = 512; + + __shared__ int s_counter; + __shared__ int s_ind; + __shared__ ushort2 s_st[stack_size]; + + if (threadIdx.x == 0) + s_counter = 0; + + __syncthreads(); + + int ind = blockIdx.y * gridDim.x + blockIdx.x; + + if (ind >= count) + return; + + ushort2 pos = st1[ind]; + + if (threadIdx.x < 8) + { + pos.x += c_dx[threadIdx.x]; + pos.y += c_dy[threadIdx.x]; + + if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y 
< map.rows && map(pos.y, pos.x) == 1) + { + map(pos.y, pos.x) = 2; + + ind = Emulation::smem::atomicAdd(&s_counter, 1); + + s_st[ind] = pos; + } + } + + __syncthreads(); + + while (s_counter > 0 && s_counter <= stack_size - blockDim.x) + { + const int subTaskIdx = threadIdx.x >> 3; + const int portion = ::min(s_counter, blockDim.x >> 3); + + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; __syncthreads(); - if (i < rows && j < cols) - { - int x = dx.ptr(i)[j]; - int y = dy.ptr(i)[j]; - const int s = (x ^ y) < 0 ? -1 : 1; - const float m = smem[threadIdx.y + 1][threadIdx.x + 1]; - - x = ::abs(x); - y = ::abs(y); - - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; - - if (m > low_thresh) - { - const int tg22x = x * TG22; - const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); - - y <<= CANNY_SHIFT; - - if (y < tg22x) - { - if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else if( y > tg67x ) - { - if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } - } - - map.ptr(i + 1)[j + 1] = edge_type; - } - } - - #undef CANNY_SHIFT - #undef TG22 - - void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - - ////////////////////////////////////////////////////////////////////////////////////////// - - __device__ unsigned int counter = 0; - - __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) - { - #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120) - - __shared__ int smem[18][18]; - - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; - - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; - - if (ly < 14) - smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; - - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; - - __syncthreads(); - - if (i < rows && j < cols) - { - int n; - - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; - - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; - } - - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; - } - - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - - map.ptr(i + 1)[j + 1] = e; - - n = 0; - - if (e == 2) - { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y 
][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; - } - - if (n > 0) - { - const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); - st[ind] = make_ushort2(j + 1, i + 1); - } - } - - #endif - } - - void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) - { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - edgesHysteresisLocal<<>>(map, st1, rows, cols); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - - __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; - __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - - __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120 - - const int stack_size = 512; - - __shared__ unsigned int s_counter; - __shared__ unsigned int s_ind; - __shared__ ushort2 s_st[stack_size]; - if (threadIdx.x == 0) - s_counter = 0; + s_counter -= portion; + __syncthreads(); - int ind = blockIdx.y * gridDim.x + blockIdx.x; - - if (ind < count) + if (subTaskIdx < portion) { - ushort2 pos = st1[ind]; + pos.x += c_dx[threadIdx.x & 7]; + pos.y += c_dy[threadIdx.x & 7]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1) { - if (threadIdx.x < 8) - { - pos.x += c_dx[threadIdx.x]; - pos.y += c_dy[threadIdx.x]; + map(pos.y, pos.x) = 2; - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; + ind = Emulation::smem::atomicAdd(&s_counter, 1); - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - - while (s_counter > 0 && s_counter <= stack_size - blockDim.x) - { - const int subTaskIdx = threadIdx.x >> 3; - const int portion = ::min(s_counter, blockDim.x >> 3); - - pos.x = pos.y = 0; - - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - __syncthreads(); - - if (threadIdx.x == 0) - s_counter -= portion; - __syncthreads(); - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) - { - pos.x += c_dx[threadIdx.x & 7]; - pos.y += c_dy[threadIdx.x & 7]; - - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - } - - if (s_counter > 0) - { - if (threadIdx.x == 0) - { - ind = atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; - } - __syncthreads(); - - ind = s_ind; - - for (int i = threadIdx.x; i < s_counter; i += blockDim.x) - { - st2[ind + i] = s_st[i]; - } - } + s_st[ind] = pos; } } - #endif + __syncthreads(); } - void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) + if (s_counter > 0) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - - while (count > 0) + if (threadIdx.x == 0) { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - - dim3 block(128, 1, 1); - 
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
- edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
- cudaSafeCall( cudaGetLastError() );
- - cudaSafeCall( cudaDeviceSynchronize() );
- - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
- - std::swap(st1, st2);
+ ind = ::atomicAdd(&counter, s_counter);
+ s_ind = ind - s_counter; }
+ + __syncthreads();
+ + ind = s_ind;
+ + for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+ st2[ind + i] = s_st[i]; } + }
- __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
+ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
+ { + void* counter_ptr;
+ cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
+ + int count;
+ cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+ + while (count > 0) {
- const int j = blockIdx.x * 16 + threadIdx.x;
- const int i = blockIdx.y * 16 + threadIdx.y;
+ cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
- if (i < rows && j < cols)
- dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); - }
+ const dim3 block(128);
+ const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
- void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
- { - dim3 block(16, 16, 1);
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
- - getEdges<<<grid, block>>>(map, dst, rows, cols);
+ edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
 cudaSafeCall( cudaGetLastError() );
 cudaSafeCall( cudaDeviceSynchronize() );
+ + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+ + std::swap(st1, st2); }
- } // namespace canny
-}}} // namespace cv { namespace gpu { namespace device
+ } +}
+//////////////////////////////////////////////////////////////////////////////////////////
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+namespace canny
+{
+ struct GetEdges : unary_function<int, uchar>
+ {
+ __device__ __forceinline__ uchar operator ()(int e) const
+ {
+ return (uchar)(-(e >> 1));
+ }
+
+ __device__ __forceinline__ GetEdges() {}
+ __device__ __forceinline__ GetEdges(const GetEdges&) {}
+ };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
+ {
+ enum { smart_shift = 4 };
+ };
+}}}
+
+namespace canny
+{
+ void getEdges(PtrStepSzi map, PtrStepSzb dst)
+ {
+ transform(map, dst, GetEdges(), WithOutMask(), 0);
+ }
+}
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/ccomponetns.cu b/modules/gpu/src/cuda/ccomponetns.cu index 62e81376aa..c094e08c0e 100644
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@@ -497,6 +497,7 @@ namespace cv { namespace gpu { namespace device
 void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
 {
+ (void) flags;
 dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
 dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
@@ -529,4 +530,4 @@ namespace cv { namespace gpu { namespace device
 } } } }
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.0.cu b/modules/gpu/src/cuda/column_filter.0.cu new file mode 100644 index 0000000000..c35c6ee64d
--- /dev/null
+++ b/modules/gpu/src/cuda/column_filter.0.cu
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING
OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.1.cu b/modules/gpu/src/cuda/column_filter.1.cu new file mode 100644 index 0000000000..9a2d6a0427 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.1.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.10.cu b/modules/gpu/src/cuda/column_filter.10.cu new file mode 100644 index 0000000000..41e35bc1c6 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.10.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.11.cu b/modules/gpu/src/cuda/column_filter.11.cu new file mode 100644 index 0000000000..981208a68b --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.11.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.12.cu b/modules/gpu/src/cuda/column_filter.12.cu new file mode 100644 index 0000000000..13d2e60023 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.12.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.13.cu b/modules/gpu/src/cuda/column_filter.13.cu new file mode 100644 index 0000000000..09f6484af4 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.13.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.14.cu b/modules/gpu/src/cuda/column_filter.14.cu new file mode 100644 index 0000000000..901ab03011 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.14.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.2.cu b/modules/gpu/src/cuda/column_filter.2.cu new file mode 100644 index 0000000000..05ee01c763 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.2.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.3.cu b/modules/gpu/src/cuda/column_filter.3.cu new file mode 100644 index 0000000000..1bf49219f9 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.3.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.4.cu b/modules/gpu/src/cuda/column_filter.4.cu new file mode 100644 index 0000000000..bec7a085a0 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.4.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.5.cu b/modules/gpu/src/cuda/column_filter.5.cu new file mode 100644 index 0000000000..8194ee39aa --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.5.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.6.cu b/modules/gpu/src/cuda/column_filter.6.cu new file mode 100644 index 0000000000..d8fc49be68 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.6.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.7.cu b/modules/gpu/src/cuda/column_filter.7.cu new file mode 100644 index 0000000000..534bd821ef --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.7.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.8.cu b/modules/gpu/src/cuda/column_filter.8.cu new file mode 100644 index 0000000000..38e70e772e --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.8.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.9.cu b/modules/gpu/src/cuda/column_filter.9.cu new file mode 100644 index 0000000000..5b58345820 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.9.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu deleted file mode 100644 index af7369ad5e..0000000000 --- a/modules/gpu/src/cuda/column_filter.cu +++ /dev/null @@ -1,391 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. 
-// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if !defined CUDA_DISABLER - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" -#include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/border_interpolate.hpp" -#include "opencv2/gpu/device/static_check.hpp" - -namespace cv { namespace gpu { namespace device -{ - namespace column_filter - { - #define MAX_KERNEL_SIZE 32 - - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - - void loadKernel(const float* kernel, int ksize, cudaStream_t stream) - { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - } - - template - __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 16; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = KSIZE <= 16 ? 
1 : 2; - #else - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 2; - const int HALO_SIZE = 2; - #endif - - typedef typename TypeVec::cn>::vec_type sum_t; - - __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; - - const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - - if (x >= src.cols) - return; - - const T* src_col = src.ptr() + x; - - const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; - - if (blockIdx.y > 0) - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); - } - else - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); - } - - if (blockIdx.y + 2 < gridDim.y) - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); - } - else - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); - } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - { - const int y = yStart + j * BLOCK_DIM_Y; - - if (y < src.rows) - { - sum_t sum = VecTraits::all(0); - - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; - - dst(y, x) = saturate_cast(sum); - } - } - } - - template class B> - void linearColumnFilter_caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) - { - int BLOCK_DIM_X; - int BLOCK_DIM_Y; - int PATCH_PER_BLOCK; - - if (cc >= 20) - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 16; - PATCH_PER_BLOCK = 4; - } - else - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 8; - PATCH_PER_BLOCK = 2; - } - - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); - - B brd(src.rows); - - linearColumnFilter<<>>(src, dst, anchor, brd); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) - { - typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); - - static const caller_t callers[5][33] = - { - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect101>, - linearColumnFilter_caller< 2, T, D, BrdColReflect101>, - linearColumnFilter_caller< 3, T, D, BrdColReflect101>, - linearColumnFilter_caller< 4, T, D, 
BrdColReflect101>, - linearColumnFilter_caller< 5, T, D, BrdColReflect101>, - linearColumnFilter_caller< 6, T, D, BrdColReflect101>, - linearColumnFilter_caller< 7, T, D, BrdColReflect101>, - linearColumnFilter_caller< 8, T, D, BrdColReflect101>, - linearColumnFilter_caller< 9, T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101>, - linearColumnFilter_caller<17, T, D, BrdColReflect101>, - linearColumnFilter_caller<18, T, D, BrdColReflect101>, - linearColumnFilter_caller<19, T, D, BrdColReflect101>, - linearColumnFilter_caller<20, T, D, BrdColReflect101>, - linearColumnFilter_caller<21, T, D, BrdColReflect101>, - linearColumnFilter_caller<22, T, D, BrdColReflect101>, - linearColumnFilter_caller<23, T, D, BrdColReflect101>, - linearColumnFilter_caller<24, T, D, BrdColReflect101>, - linearColumnFilter_caller<25, T, D, BrdColReflect101>, - linearColumnFilter_caller<26, T, D, BrdColReflect101>, - linearColumnFilter_caller<27, T, D, BrdColReflect101>, - linearColumnFilter_caller<28, T, D, BrdColReflect101>, - linearColumnFilter_caller<29, T, D, BrdColReflect101>, - linearColumnFilter_caller<30, T, D, BrdColReflect101>, - linearColumnFilter_caller<31, T, D, BrdColReflect101>, - linearColumnFilter_caller<32, T, D, BrdColReflect101> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReplicate>, - linearColumnFilter_caller< 2, T, D, BrdColReplicate>, - linearColumnFilter_caller< 3, T, D, BrdColReplicate>, - linearColumnFilter_caller< 4, T, D, BrdColReplicate>, - linearColumnFilter_caller< 5, T, D, BrdColReplicate>, - linearColumnFilter_caller< 6, T, D, BrdColReplicate>, - linearColumnFilter_caller< 7, T, D, BrdColReplicate>, - linearColumnFilter_caller< 8, T, D, BrdColReplicate>, - linearColumnFilter_caller< 9, T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate>, - linearColumnFilter_caller<17, T, D, BrdColReplicate>, - linearColumnFilter_caller<18, T, D, BrdColReplicate>, - linearColumnFilter_caller<19, T, D, BrdColReplicate>, - linearColumnFilter_caller<20, T, D, BrdColReplicate>, - linearColumnFilter_caller<21, T, D, BrdColReplicate>, - linearColumnFilter_caller<22, T, D, BrdColReplicate>, - linearColumnFilter_caller<23, T, D, BrdColReplicate>, - linearColumnFilter_caller<24, T, D, BrdColReplicate>, - linearColumnFilter_caller<25, T, D, BrdColReplicate>, - linearColumnFilter_caller<26, T, D, BrdColReplicate>, - linearColumnFilter_caller<27, T, D, BrdColReplicate>, - linearColumnFilter_caller<28, T, D, BrdColReplicate>, - linearColumnFilter_caller<29, T, D, BrdColReplicate>, - linearColumnFilter_caller<30, T, D, BrdColReplicate>, - linearColumnFilter_caller<31, T, D, BrdColReplicate>, - linearColumnFilter_caller<32, T, D, BrdColReplicate> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColConstant>, - linearColumnFilter_caller< 2, T, D, BrdColConstant>, - 
linearColumnFilter_caller< 3, T, D, BrdColConstant>, - linearColumnFilter_caller< 4, T, D, BrdColConstant>, - linearColumnFilter_caller< 5, T, D, BrdColConstant>, - linearColumnFilter_caller< 6, T, D, BrdColConstant>, - linearColumnFilter_caller< 7, T, D, BrdColConstant>, - linearColumnFilter_caller< 8, T, D, BrdColConstant>, - linearColumnFilter_caller< 9, T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - linearColumnFilter_caller<16, T, D, BrdColConstant>, - linearColumnFilter_caller<17, T, D, BrdColConstant>, - linearColumnFilter_caller<18, T, D, BrdColConstant>, - linearColumnFilter_caller<19, T, D, BrdColConstant>, - linearColumnFilter_caller<20, T, D, BrdColConstant>, - linearColumnFilter_caller<21, T, D, BrdColConstant>, - linearColumnFilter_caller<22, T, D, BrdColConstant>, - linearColumnFilter_caller<23, T, D, BrdColConstant>, - linearColumnFilter_caller<24, T, D, BrdColConstant>, - linearColumnFilter_caller<25, T, D, BrdColConstant>, - linearColumnFilter_caller<26, T, D, BrdColConstant>, - linearColumnFilter_caller<27, T, D, BrdColConstant>, - linearColumnFilter_caller<28, T, D, BrdColConstant>, - linearColumnFilter_caller<29, T, D, BrdColConstant>, - linearColumnFilter_caller<30, T, D, BrdColConstant>, - linearColumnFilter_caller<31, T, D, BrdColConstant>, - linearColumnFilter_caller<32, T, D, BrdColConstant> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect>, - linearColumnFilter_caller< 2, T, D, BrdColReflect>, - linearColumnFilter_caller< 3, T, D, BrdColReflect>, - linearColumnFilter_caller< 4, T, D, BrdColReflect>, - linearColumnFilter_caller< 5, T, D, BrdColReflect>, - linearColumnFilter_caller< 6, T, D, BrdColReflect>, - linearColumnFilter_caller< 7, T, D, BrdColReflect>, - linearColumnFilter_caller< 8, T, D, BrdColReflect>, - linearColumnFilter_caller< 9, T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect>, - linearColumnFilter_caller<17, T, D, BrdColReflect>, - linearColumnFilter_caller<18, T, D, BrdColReflect>, - linearColumnFilter_caller<19, T, D, BrdColReflect>, - linearColumnFilter_caller<20, T, D, BrdColReflect>, - linearColumnFilter_caller<21, T, D, BrdColReflect>, - linearColumnFilter_caller<22, T, D, BrdColReflect>, - linearColumnFilter_caller<23, T, D, BrdColReflect>, - linearColumnFilter_caller<24, T, D, BrdColReflect>, - linearColumnFilter_caller<25, T, D, BrdColReflect>, - linearColumnFilter_caller<26, T, D, BrdColReflect>, - linearColumnFilter_caller<27, T, D, BrdColReflect>, - linearColumnFilter_caller<28, T, D, BrdColReflect>, - linearColumnFilter_caller<29, T, D, BrdColReflect>, - linearColumnFilter_caller<30, T, D, BrdColReflect>, - linearColumnFilter_caller<31, T, D, BrdColReflect>, - linearColumnFilter_caller<32, T, D, BrdColReflect> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColWrap>, - linearColumnFilter_caller< 2, T, D, BrdColWrap>, - linearColumnFilter_caller< 3, T, D, BrdColWrap>, 
- linearColumnFilter_caller< 4, T, D, BrdColWrap>, - linearColumnFilter_caller< 5, T, D, BrdColWrap>, - linearColumnFilter_caller< 6, T, D, BrdColWrap>, - linearColumnFilter_caller< 7, T, D, BrdColWrap>, - linearColumnFilter_caller< 8, T, D, BrdColWrap>, - linearColumnFilter_caller< 9, T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, - linearColumnFilter_caller<16, T, D, BrdColWrap>, - linearColumnFilter_caller<17, T, D, BrdColWrap>, - linearColumnFilter_caller<18, T, D, BrdColWrap>, - linearColumnFilter_caller<19, T, D, BrdColWrap>, - linearColumnFilter_caller<20, T, D, BrdColWrap>, - linearColumnFilter_caller<21, T, D, BrdColWrap>, - linearColumnFilter_caller<22, T, D, BrdColWrap>, - linearColumnFilter_caller<23, T, D, BrdColWrap>, - linearColumnFilter_caller<24, T, D, BrdColWrap>, - linearColumnFilter_caller<25, T, D, BrdColWrap>, - linearColumnFilter_caller<26, T, D, BrdColWrap>, - linearColumnFilter_caller<27, T, D, BrdColWrap>, - linearColumnFilter_caller<28, T, D, BrdColWrap>, - linearColumnFilter_caller<29, T, D, BrdColWrap>, - linearColumnFilter_caller<30, T, D, BrdColWrap>, - linearColumnFilter_caller<31, T, D, BrdColWrap>, - linearColumnFilter_caller<32, T, D, BrdColWrap> - } - }; - - loadKernel(kernel, ksize, stream); - - callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); - } - - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } // namespace column_filter -}}} // namespace cv { namespace gpu { namespace device - - -#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.h b/modules/gpu/src/cuda/column_filter.h new file mode 100644 index 0000000000..52b9103393 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.h @@ -0,0 +1,373 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. 
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace column_filter +{ + #define MAX_KERNEL_SIZE 32 + + __constant__ float c_kernel[MAX_KERNEL_SIZE]; + + template + __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 16; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = KSIZE <= 16 ? 
1 : 2; + #else + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 2; + const int HALO_SIZE = 2; + #endif + + typedef typename TypeVec::cn>::vec_type sum_t; + + __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; + + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + + if (x >= src.cols) + return; + + const T* src_col = src.ptr() + x; + + const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; + + if (blockIdx.y > 0) + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); + } + else + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); + } + + if (blockIdx.y + 2 < gridDim.y) + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); + } + else + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int y = yStart + j * BLOCK_DIM_Y; + + if (y < src.rows) + { + sum_t sum = VecTraits::all(0); + + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; + + dst(y, x) = saturate_cast(sum); + } + } + } + + template class B> + void caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) + { + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 16; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 2; + } + + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); + + B brd(src.rows); + + linearColumnFilter<<>>(src, dst, anchor, brd); + + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +namespace filter +{ + template + void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + { + typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); + + static const caller_t callers[5][33] = + { + { + 0, + column_filter::caller< 1, T, D, BrdColReflect101>, + column_filter::caller< 2, T, D, BrdColReflect101>, + column_filter::caller< 3, T, D, BrdColReflect101>, + column_filter::caller< 4, T, D, BrdColReflect101>, + 
column_filter::caller< 5, T, D, BrdColReflect101>, + column_filter::caller< 6, T, D, BrdColReflect101>, + column_filter::caller< 7, T, D, BrdColReflect101>, + column_filter::caller< 8, T, D, BrdColReflect101>, + column_filter::caller< 9, T, D, BrdColReflect101>, + column_filter::caller<10, T, D, BrdColReflect101>, + column_filter::caller<11, T, D, BrdColReflect101>, + column_filter::caller<12, T, D, BrdColReflect101>, + column_filter::caller<13, T, D, BrdColReflect101>, + column_filter::caller<14, T, D, BrdColReflect101>, + column_filter::caller<15, T, D, BrdColReflect101>, + column_filter::caller<16, T, D, BrdColReflect101>, + column_filter::caller<17, T, D, BrdColReflect101>, + column_filter::caller<18, T, D, BrdColReflect101>, + column_filter::caller<19, T, D, BrdColReflect101>, + column_filter::caller<20, T, D, BrdColReflect101>, + column_filter::caller<21, T, D, BrdColReflect101>, + column_filter::caller<22, T, D, BrdColReflect101>, + column_filter::caller<23, T, D, BrdColReflect101>, + column_filter::caller<24, T, D, BrdColReflect101>, + column_filter::caller<25, T, D, BrdColReflect101>, + column_filter::caller<26, T, D, BrdColReflect101>, + column_filter::caller<27, T, D, BrdColReflect101>, + column_filter::caller<28, T, D, BrdColReflect101>, + column_filter::caller<29, T, D, BrdColReflect101>, + column_filter::caller<30, T, D, BrdColReflect101>, + column_filter::caller<31, T, D, BrdColReflect101>, + column_filter::caller<32, T, D, BrdColReflect101> + }, + { + 0, + column_filter::caller< 1, T, D, BrdColReplicate>, + column_filter::caller< 2, T, D, BrdColReplicate>, + column_filter::caller< 3, T, D, BrdColReplicate>, + column_filter::caller< 4, T, D, BrdColReplicate>, + column_filter::caller< 5, T, D, BrdColReplicate>, + column_filter::caller< 6, T, D, BrdColReplicate>, + column_filter::caller< 7, T, D, BrdColReplicate>, + column_filter::caller< 8, T, D, BrdColReplicate>, + column_filter::caller< 9, T, D, BrdColReplicate>, + column_filter::caller<10, T, D, BrdColReplicate>, + column_filter::caller<11, T, D, BrdColReplicate>, + column_filter::caller<12, T, D, BrdColReplicate>, + column_filter::caller<13, T, D, BrdColReplicate>, + column_filter::caller<14, T, D, BrdColReplicate>, + column_filter::caller<15, T, D, BrdColReplicate>, + column_filter::caller<16, T, D, BrdColReplicate>, + column_filter::caller<17, T, D, BrdColReplicate>, + column_filter::caller<18, T, D, BrdColReplicate>, + column_filter::caller<19, T, D, BrdColReplicate>, + column_filter::caller<20, T, D, BrdColReplicate>, + column_filter::caller<21, T, D, BrdColReplicate>, + column_filter::caller<22, T, D, BrdColReplicate>, + column_filter::caller<23, T, D, BrdColReplicate>, + column_filter::caller<24, T, D, BrdColReplicate>, + column_filter::caller<25, T, D, BrdColReplicate>, + column_filter::caller<26, T, D, BrdColReplicate>, + column_filter::caller<27, T, D, BrdColReplicate>, + column_filter::caller<28, T, D, BrdColReplicate>, + column_filter::caller<29, T, D, BrdColReplicate>, + column_filter::caller<30, T, D, BrdColReplicate>, + column_filter::caller<31, T, D, BrdColReplicate>, + column_filter::caller<32, T, D, BrdColReplicate> + }, + { + 0, + column_filter::caller< 1, T, D, BrdColConstant>, + column_filter::caller< 2, T, D, BrdColConstant>, + column_filter::caller< 3, T, D, BrdColConstant>, + column_filter::caller< 4, T, D, BrdColConstant>, + column_filter::caller< 5, T, D, BrdColConstant>, + column_filter::caller< 6, T, D, BrdColConstant>, + column_filter::caller< 7, T, D, BrdColConstant>, + column_filter::caller< 
8, T, D, BrdColConstant>, + column_filter::caller< 9, T, D, BrdColConstant>, + column_filter::caller<10, T, D, BrdColConstant>, + column_filter::caller<11, T, D, BrdColConstant>, + column_filter::caller<12, T, D, BrdColConstant>, + column_filter::caller<13, T, D, BrdColConstant>, + column_filter::caller<14, T, D, BrdColConstant>, + column_filter::caller<15, T, D, BrdColConstant>, + column_filter::caller<16, T, D, BrdColConstant>, + column_filter::caller<17, T, D, BrdColConstant>, + column_filter::caller<18, T, D, BrdColConstant>, + column_filter::caller<19, T, D, BrdColConstant>, + column_filter::caller<20, T, D, BrdColConstant>, + column_filter::caller<21, T, D, BrdColConstant>, + column_filter::caller<22, T, D, BrdColConstant>, + column_filter::caller<23, T, D, BrdColConstant>, + column_filter::caller<24, T, D, BrdColConstant>, + column_filter::caller<25, T, D, BrdColConstant>, + column_filter::caller<26, T, D, BrdColConstant>, + column_filter::caller<27, T, D, BrdColConstant>, + column_filter::caller<28, T, D, BrdColConstant>, + column_filter::caller<29, T, D, BrdColConstant>, + column_filter::caller<30, T, D, BrdColConstant>, + column_filter::caller<31, T, D, BrdColConstant>, + column_filter::caller<32, T, D, BrdColConstant> + }, + { + 0, + column_filter::caller< 1, T, D, BrdColReflect>, + column_filter::caller< 2, T, D, BrdColReflect>, + column_filter::caller< 3, T, D, BrdColReflect>, + column_filter::caller< 4, T, D, BrdColReflect>, + column_filter::caller< 5, T, D, BrdColReflect>, + column_filter::caller< 6, T, D, BrdColReflect>, + column_filter::caller< 7, T, D, BrdColReflect>, + column_filter::caller< 8, T, D, BrdColReflect>, + column_filter::caller< 9, T, D, BrdColReflect>, + column_filter::caller<10, T, D, BrdColReflect>, + column_filter::caller<11, T, D, BrdColReflect>, + column_filter::caller<12, T, D, BrdColReflect>, + column_filter::caller<13, T, D, BrdColReflect>, + column_filter::caller<14, T, D, BrdColReflect>, + column_filter::caller<15, T, D, BrdColReflect>, + column_filter::caller<16, T, D, BrdColReflect>, + column_filter::caller<17, T, D, BrdColReflect>, + column_filter::caller<18, T, D, BrdColReflect>, + column_filter::caller<19, T, D, BrdColReflect>, + column_filter::caller<20, T, D, BrdColReflect>, + column_filter::caller<21, T, D, BrdColReflect>, + column_filter::caller<22, T, D, BrdColReflect>, + column_filter::caller<23, T, D, BrdColReflect>, + column_filter::caller<24, T, D, BrdColReflect>, + column_filter::caller<25, T, D, BrdColReflect>, + column_filter::caller<26, T, D, BrdColReflect>, + column_filter::caller<27, T, D, BrdColReflect>, + column_filter::caller<28, T, D, BrdColReflect>, + column_filter::caller<29, T, D, BrdColReflect>, + column_filter::caller<30, T, D, BrdColReflect>, + column_filter::caller<31, T, D, BrdColReflect>, + column_filter::caller<32, T, D, BrdColReflect> + }, + { + 0, + column_filter::caller< 1, T, D, BrdColWrap>, + column_filter::caller< 2, T, D, BrdColWrap>, + column_filter::caller< 3, T, D, BrdColWrap>, + column_filter::caller< 4, T, D, BrdColWrap>, + column_filter::caller< 5, T, D, BrdColWrap>, + column_filter::caller< 6, T, D, BrdColWrap>, + column_filter::caller< 7, T, D, BrdColWrap>, + column_filter::caller< 8, T, D, BrdColWrap>, + column_filter::caller< 9, T, D, BrdColWrap>, + column_filter::caller<10, T, D, BrdColWrap>, + column_filter::caller<11, T, D, BrdColWrap>, + column_filter::caller<12, T, D, BrdColWrap>, + column_filter::caller<13, T, D, BrdColWrap>, + column_filter::caller<14, T, D, BrdColWrap>, + 
column_filter::caller<15, T, D, BrdColWrap>, + column_filter::caller<16, T, D, BrdColWrap>, + column_filter::caller<17, T, D, BrdColWrap>, + column_filter::caller<18, T, D, BrdColWrap>, + column_filter::caller<19, T, D, BrdColWrap>, + column_filter::caller<20, T, D, BrdColWrap>, + column_filter::caller<21, T, D, BrdColWrap>, + column_filter::caller<22, T, D, BrdColWrap>, + column_filter::caller<23, T, D, BrdColWrap>, + column_filter::caller<24, T, D, BrdColWrap>, + column_filter::caller<25, T, D, BrdColWrap>, + column_filter::caller<26, T, D, BrdColWrap>, + column_filter::caller<27, T, D, BrdColWrap>, + column_filter::caller<28, T, D, BrdColWrap>, + column_filter::caller<29, T, D, BrdColWrap>, + column_filter::caller<30, T, D, BrdColWrap>, + column_filter::caller<31, T, D, BrdColWrap>, + column_filter::caller<32, T, D, BrdColWrap> + } + }; + + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); + + callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); + } +} diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index c61601d4f7..27fb61ff70 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -42,405 +42,875 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { namespace device -{ - ////////////////////////////////////////////////////////////////////////// - // add +using namespace cv::gpu; +using namespace cv::gpu::device; - template struct Add : binary_function +namespace arithm +{ + template struct ArithmFuncTraits + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 1 }; + }; + + template <> struct ArithmFuncTraits<1, 1> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<1, 2> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<1, 4> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template <> struct ArithmFuncTraits<2, 1> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<2, 2> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<2, 4> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; 
+ enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template <> struct ArithmFuncTraits<4, 1> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<4, 2> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct ArithmFuncTraits<4, 4> + { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +} + +////////////////////////////////////////////////////////////////////////// +// addMat + +namespace arithm +{ + template struct VAdd4; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + 
asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + + //////////////////////////////////// + + template struct VAdd2; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + + //////////////////////////////////// + + template struct AddMat : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a + b); } + + __device__ __forceinline__ AddMat() {} + __device__ __forceinline__ AddMat(const AddMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits + { }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + 
//////////////////////////////////// + + template struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::AddMat > : arithm::ArithmFuncTraits { - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), WithOutMask(), stream); + }; +}}} + +namespace arithm +{ + template + void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd4(), WithOutMask(), stream); } - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, 
const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& 
mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct AddScalar : unary_function + template + void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - AddScalar(double val_) : val(val_) {} + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd2(), WithOutMask(), stream); + } + + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), WithOutMask(), stream); + } + + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, 
cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, 
cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// addScalar + +namespace arithm +{ + template struct AddScalar : unary_function + { + S val; + + explicit AddScalar(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { return saturate_cast(a + val); } - const double val; }; +} - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::AddScalar > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) +namespace arithm +{ + template + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AddScalar op(val); + AddScalar op(static_cast(val)); + if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t 
stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t 
stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template 
void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void 
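// The blocks of `template void addScalar(...)` lines above and below are explicit
// template instantiations: the function body lives only in this .cu file, so every
// type combination the GPU module exposes has to be listed here for the compiler to
// emit device code for it, and the commented-out lines are combinations that are not
// exported. A minimal, host-compilable sketch of the same pattern, with hypothetical
// names (not part of the module):

#include <iostream>

template <typename T, typename D>
void scaleBuffer(const T* src, D* dst, int n, double scale)
{
    for (int i = 0; i < n; ++i)
        dst[i] = static_cast<D>(src[i] * scale);   // body visible only in this file
}

// Explicit instantiations: other translation units see only a declaration, so every
// combination that must link gets a line here.
template void scaleBuffer<unsigned char, float>(const unsigned char*, float*, int, double);
template void scaleBuffer<short, float>(const short*, float*, int, double);

int main()
{
    unsigned char src[3] = {1, 2, 3};
    float dst[3];
    scaleBuffer(src, dst, 3, 0.5);
    std::cout << dst[0] << ' ' << dst[1] << ' ' << dst[2] << '\n';  // 0.5 1 1.5
    return 0;
}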
addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // subtract +////////////////////////////////////////////////////////////////////////// +// subMat - template struct Subtract : binary_function +namespace arithm +{ + template struct VSub4; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + 
__device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + + //////////////////////////////////// + + template struct VSub2; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + + //////////////////////////////////// + + template struct SubMat : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a - b); } + + __device__ __forceinline__ SubMat() {} + __device__ __forceinline__ SubMat(const SubMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits + { }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : 
DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::SubMat > : arithm::ArithmFuncTraits { - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), WithOutMask(), stream); + }; +}}} + +namespace arithm +{ + template + void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub4(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, 
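// The VSub4 specializations above wrap the sm_30 SIMD instruction vsub4 (and, on
// sm_20, four per-byte vsub operations with .b0..b3 selectors), so four packed bytes
// in one 32-bit word are subtracted lane-by-lane with saturation; as written, targets
// below sm_20 skip the guarded asm and the functor returns 0. A portable,
// host-compilable sketch of what the unsigned variant (vsub4.u32.u32.u32.sat)
// computes - illustrative names only, not part of the module:

#include <cstdint>
#include <cstdio>

static uint32_t vsub4_u32_sat(uint32_t a, uint32_t b)
{
    uint32_t res = 0;
    for (int lane = 0; lane < 4; ++lane)
    {
        int av = (a >> (8 * lane)) & 0xFF;
        int bv = (b >> (8 * lane)) & 0xFF;
        int d  = av - bv;
        if (d < 0) d = 0;                              // unsigned saturation per byte
        res |= static_cast<uint32_t>(d) << (8 * lane);
    }
    return res;
}

int main()
{
    // bytes {10, 200, 5, 255} minus {20, 100, 5, 1} -> {0, 100, 0, 254}
    uint32_t a = 0xFF05C80Au, b = 0x01056414u;
    printf("%08X\n", (unsigned) vsub4_u32_sat(a, b)); // FE006400
    return 0;
}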
const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void 
subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct SubtractScalar : unary_function + template + void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - SubtractScalar(double val_) : val(val_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return saturate_cast(a - val); - } - const double val; - }; - - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - SubtractScalar op(val); - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub2(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void 
subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), WithOutMask(), stream); + } - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, 
PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, 
PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - ////////////////////////////////////////////////////////////////////////// - // multiply + 
//template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - struct multiply_8uc4_32f : binary_function + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subScalar + +namespace arithm +{ + template + void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + AddScalar op(-static_cast(val)); + + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t 
stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, 
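// subScalar introduces no functor of its own: it reuses AddScalar with the scalar
// negated on the host, since adding -val and then saturating gives the same result as
// subtracting val while the scalar stays in a floating-point type. A small host-side
// sketch of the idea - clampToU8 stands in for saturate_cast<uchar>, and the names
// are illustrative:

#include <cstdio>

static unsigned char clampToU8(float v)
{
    if (v < 0.f)   return 0;
    if (v > 255.f) return 255;
    return static_cast<unsigned char>(v + 0.5f);       // round-to-nearest, clamped
}

struct AddScalarF            // same shape as the AddScalar functor: stores the scalar once
{
    float val;
    explicit AddScalarF(float v) : val(v) {}
    unsigned char operator()(unsigned char a) const { return clampToU8(a + val); }
};

int main()
{
    AddScalarF subTen(-10.f);                          // "subtract 10" written as "add -10"
    printf("%d %d\n", subTen(5), subTen(200));         // 0 190 - the low end saturates at 0
    return 0;
}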
double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulMat + +namespace arithm +{ + struct Mul_8uc4_32f : binary_function { __device__ __forceinline__ uint operator ()(uint a, float b) const { @@ -453,301 +923,262 @@ namespace cv { namespace gpu { namespace device return res; } + + __device__ __forceinline__ Mul_8uc4_32f() {} + __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f) - { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; - - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_8uc4_32f(), WithOutMask(), stream); - } - - struct multiply_16sc4_32f : binary_function + struct Mul_16sc4_32f : binary_function { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), saturate_cast(a.z * b), saturate_cast(a.w * b)); } + + __device__ __forceinline__ Mul_16sc4_32f() {} + __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f) + template struct Mul : binary_function { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a * b); + } + + __device__ __forceinline__ Mul() {} + __device__ __forceinline__ Mul(const Mul& other) {} }; - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template struct MulScale : binary_function { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_16sc4_32f(), WithOutMask(), stream); - } + S scale; + + explicit MulScale(S scale_) : scale(scale_) {} - template struct Multiply : binary_function - { - Multiply(float scale_) : scale(scale_) {} __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(scale * a * b); } - const float scale; }; - template struct Multiply : binary_function +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : arithm::ArithmFuncTraits { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ double operator ()(T a, T b) const - { - return scale * a * b; - } - const double scale; - }; - template <> struct Multiply : binary_function - { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ int operator ()(int a, int b) const - { - return saturate_cast(scale * a * b); - } - const double scale; }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + template struct TransformFunctorTraits< arithm::Mul > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct 
TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template struct MultiplyCaller + template struct TransformFunctorTraits< arithm::MulScale > : arithm::ArithmFuncTraits { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - Multiply op(static_cast(scale)); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } - }; - template struct MultiplyCaller - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } - }; - template <> struct MultiplyCaller - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } }; +}}} - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - MultiplyCaller::call(src1, src2, dst, scale, stream); + transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream); } - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const 
PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct MultiplyScalar : unary_function + void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {} + transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream); + } + + template + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) + { + Mul op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + else + { + MulScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + } + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t 
stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template 
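// mulMat picks its functor on the host: the common scale == 1 case uses the plain Mul
// functor (one multiply, then saturate), while any other scale goes through MulScale,
// which folds the factor into the product before saturating. A host-side sketch of the
// same dispatch - clampToU8 stands in for saturate_cast<uchar>, the loop for the
// transform() call, and the names are illustrative:

#include <cstdio>

static unsigned char clampToU8(float v)
{
    return v < 0.f ? 0 : v > 255.f ? 255 : static_cast<unsigned char>(v + 0.5f);
}

static void mulBuffer(const unsigned char* a, const unsigned char* b,
                      unsigned char* dst, int n, double scale)
{
    if (scale == 1.0)
    {
        for (int i = 0; i < n; ++i)                     // Mul: saturate(a * b)
            dst[i] = clampToU8(float(a[i]) * float(b[i]));
    }
    else
    {
        float s = static_cast<float>(scale);            // MulScale: saturate(scale * a * b)
        for (int i = 0; i < n; ++i)
            dst[i] = clampToU8(s * float(a[i]) * float(b[i]));
    }
}

int main()
{
    unsigned char a[2] = {20, 200}, b[2] = {10, 2}, dst[2];
    mulBuffer(a, b, dst, 2, 0.5);
    printf("%d %d\n", dst[0], dst[1]);                  // 100 200
    return 0;
}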
void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulScalar + +namespace arithm +{ + template struct MulScalar : unary_function + { + S val; + + explicit MulScalar(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { - return saturate_cast(scale * a * val); + return saturate_cast(a * val); } - const double val; - const double scale; }; +} - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::MulScalar > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - MultiplyScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + MulScalar op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void 
mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, 
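The replacement mulScalar path is considerably simpler than the removed multiply_gpu: the scalar is converted to the working type on the host with static_cast, so the cudaSetDoubleForDevice round trip disappears, and the per-element work is one multiply followed by saturate_cast. The sketch below reproduces the functor with assumed parameter names (T source, S working scalar, D destination; the angle brackets are stripped from this patch text) and applies it with Thrust instead of OpenCV's transform framework; static_cast stands in for saturate_cast<D>.

```cpp
#include <thrust/device_vector.h>
#include <thrust/transform.h>

// Assumed shape of arithm::MulScalar: the value is stored in the working
// type S, and every element is multiplied and narrowed to D.
template <typename T, typename S, typename D>
struct MulScalar
{
    S val;
    explicit MulScalar(S v) : val(v) {}
    __device__ D operator()(T a) const { return static_cast<D>(a * val); }
};

int main()
{
    thrust::device_vector<unsigned char> src(256, 3);
    thrust::device_vector<unsigned char> dst(256);

    // OpenCV would go through arithm::mulScalar(...) and its transform()
    // helper; thrust::transform shows the same element-wise effect.
    thrust::transform(src.begin(), src.end(), dst.begin(),
                      MulScalar<unsigned char, float, unsigned char>(2.5f));
    return 0;
}
```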
cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, 
double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // divide +////////////////////////////////////////////////////////////////////////// +// divMat - struct divide_8uc4_32f : binary_function +namespace arithm +{ + struct Div_8uc4_32f : binary_function { - __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const + __device__ __forceinline__ uint operator ()(uint a, float b) const { - return b != 0 ? 
make_uchar4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) - : make_uchar4(0,0,0,0); + uint res = 0; + + if (b != 0) + { + b = 1.0f / b; + res |= (saturate_cast((0xffu & (a )) * b) ); + res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); + res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); + res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + } + + return res; } }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f) - { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; - - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_8uc4_32f(), WithOutMask(), stream); - } - - - struct divide_16sc4_32f : binary_function + struct Div_16sc4_32f : binary_function { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { @@ -757,586 +1188,847 @@ namespace cv { namespace gpu { namespace device } }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f) + template struct Div : binary_function { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; - - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_16sc4_32f(), WithOutMask(), stream); - } - - template struct Divide : binary_function - { - Divide(double scale_) : scale(scale_) {} __device__ __forceinline__ D operator ()(T a, T b) const { - return b != 0 ? saturate_cast(a * scale / b) : 0; + return b != 0 ? saturate_cast(a / b) : 0; } - const double scale; + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template struct Div : binary_function + { + __device__ __forceinline__ float operator ()(T a, T b) const + { + return b != 0 ? static_cast(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template struct Div : binary_function + { + __device__ __forceinline__ double operator ()(T a, T b) const + { + return b != 0 ? static_cast(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + template struct DivScale : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + S scale; + + explicit DivScale(S scale_) : scale(scale_) {} + + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? 
saturate_cast(scale * a / b) : 0; + } }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + template struct TransformFunctorTraits< arithm::Div > : arithm::ArithmFuncTraits { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Divide op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + }; + + template struct TransformFunctorTraits< arithm::DivScale > : arithm::ArithmFuncTraits + { + }; +}}} + +namespace arithm +{ + void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) + { + transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void 
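Div_8uc4_32f now works on a packed 32-bit word instead of a uchar4: the divisor is inverted once (one floating-point division instead of four), each byte lane is unpacked, scaled and saturated, and the lanes are OR-ed back together, with a zero result when the divisor is zero. The sketch below is a self-contained version of that idea; clamp_u8 stands in for saturate_cast<uchar>, and the kernel shell exists only to make it runnable.

```cpp
#include <cuda_runtime.h>

// Round-to-nearest clamp to [0, 255]; a stand-in for saturate_cast<uchar>.
__device__ unsigned int clamp_u8(float v)
{
    v = fminf(fmaxf(v, 0.0f), 255.0f);
    return static_cast<unsigned int>(v + 0.5f);
}

// Divide the four uchar lanes of a packed word by a single float divisor.
__device__ unsigned int div_8uc4_32f(unsigned int a, float b)
{
    unsigned int res = 0;
    if (b != 0.0f)
    {
        b = 1.0f / b;   // one division, then four multiplies
        res |=  clamp_u8(( a        & 0xffu) * b);
        res |= (clamp_u8(((a >>  8) & 0xffu) * b) <<  8);
        res |= (clamp_u8(((a >> 16) & 0xffu) * b) << 16);
        res |= (clamp_u8(((a >> 24) & 0xffu) * b) << 24);
    }
    return res;
}

__global__ void div_8uc4_kernel(const unsigned int* src, const float* divisors,
                                unsigned int* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = div_8uc4_32f(src[i], divisors[i]);
}
```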
divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, 
const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct DivideScalar : unary_function + void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {} + transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream); + } + + template + void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) + { + Div op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + else + { + DivScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + } + + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + 
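divMat now dispatches on the host: when scale == 1 it instantiates the plain Div functor, so the common case skips the extra multiply per element, and only the scaled case uses DivScale, which carries the factor in the working type. The sketch below mirrors that structure with raw pointers and a generic transform kernel; the <T, D> and <T, S, D> parameter names are assumed (the angle brackets are missing from this patch text) and static_cast replaces saturate_cast.

```cpp
#include <cuda_runtime.h>

template <typename T, typename D>
struct Div          // a / b, 0 when b == 0
{
    __device__ D operator()(T a, T b) const
    { return b != 0 ? static_cast<D>(a / b) : D(0); }
};

template <typename T, typename S, typename D>
struct DivScale     // scale * a / b, 0 when b == 0
{
    S scale;
    explicit DivScale(S s) : scale(s) {}
    __device__ D operator()(T a, T b) const
    { return b != 0 ? static_cast<D>(scale * a / b) : D(0); }
};

template <typename T, typename D, typename Op>
__global__ void transform_kernel(const T* a, const T* b, D* c, int n, Op op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = op(a[i], b[i]);
}

// Host-side dispatch, mirroring divMat: pick the functor before launching.
template <typename T, typename S, typename D>
void divMat_sketch(const T* a, const T* b, D* c, int n, double scale, cudaStream_t stream)
{
    const int block = 256;
    const int grid  = (n + block - 1) / block;
    if (scale == 1)
        transform_kernel<<<grid, block, 0, stream>>>(a, b, c, n, Div<T, D>());
    else
        transform_kernel<<<grid, block, 0, stream>>>(a, b, c, n,
                                                     DivScale<T, S, D>(static_cast<S>(scale)));
}
```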
template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divScalar + +namespace arithm +{ + template + void 
divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) + { + MulScalar op(static_cast(1.0 / val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, 
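divScalar no longer has a functor of its own: dividing every element by a scalar is rewritten on the host as multiplication by the reciprocal and handed to MulScalar, so a single host-side division replaces a per-element divide. A hypothetical call (the angle-bracket type arguments are assumed, since they are stripped from this patch text) would expand as follows.

```cpp
// Assumed expansion of a divScalar call; MulScalar, transform() and
// WithOutMask() are the helpers named in the hunk above.
//
//   arithm::divScalar<uchar, float, uchar>(src, 4.0, dst, stream);
//   // becomes, inside divScalar:
//   //   MulScalar<uchar, float, uchar> op(static_cast<float>(1.0 / 4.0));   // 0.25f
//   //   transform((PtrStepSz<uchar>) src, (PtrStepSz<uchar>) dst, op, WithOutMask(), stream);
```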
cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divInv + +namespace arithm +{ + template struct DivInv : unary_function + { + S val; + + explicit DivInv(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { - return saturate_cast(scale * a / val); + return a != 0 ? saturate_cast(val / a) : 0; } - const double val; - const double scale; }; +} - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::DivInv > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - DivideScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + DivInv op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, 
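DivInv is the scalar-over-matrix counterpart (it replaces the old Reciprocal functor removed further down): each element divides the stored value, with a zero result for zero elements so the kernel never divides by zero. The sketch below uses assumed parameter names <T, S, D> and static_cast in place of saturate_cast; the functor is built on the host and passed to the kernel by value, much as the transform() helper would.

```cpp
#include <cuda_runtime.h>

template <typename T, typename S, typename D>
struct DivInv
{
    S val;
    explicit DivInv(S v) : val(v) {}
    __device__ D operator()(T a) const
    { return a != 0 ? static_cast<D>(val / a) : D(0); }
};

__global__ void div_inv_kernel(const float* src, float* dst, int n,
                               DivInv<float, float, float> op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = op(src[i]);
}
// Launch as, e.g.:
//   div_inv_kernel<<<grid, block>>>(src, dst, n, DivInv<float, float, float>(255.0f));
```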
cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t 
stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, 
cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} - template struct Reciprocal : unary_function +////////////////////////////////////////////////////////////////////////// +// absDiffMat + +namespace arithm +{ + template struct VAbsDiff4; + template <> struct VAbsDiff4 : binary_function { - Reciprocal(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - return a != 0 ? 
saturate_cast(scale / a) : 0; + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double scale; + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} + }; + template <> struct VAbsDiff4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + //////////////////////////////////// + + template struct VAbsDiff2; + template <> struct VAbsDiff2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {} }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + template <> struct VAbsDiff2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ 
VAbsDiff2(const VAbsDiff2& other) {} }; - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream) + //////////////////////////////////// + + __device__ __forceinline__ int _abs(int a) { - cudaSafeCall( cudaSetDoubleForDevice(&scalar) ); - Reciprocal op(scalar); - cv::gpu::device::transform((PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + return ::abs(a); + } + __device__ __forceinline__ float _abs(float a) + { + return ::fabsf(a); + } + __device__ __forceinline__ double _abs(double a) + { + return ::fabs(a); } - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const 
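VAbsDiff4 and VAbsDiff2 bring SIMD-in-a-word absolute differences: on sm_30 and newer a single vabsdiff4 (or vabsdiff2) PTX instruction handles four byte lanes (or two halfword lanes) per 32-bit register, and on sm_20 the same effect is built from per-lane vabsdiff byte or halfword selects. The sketch below wraps the unsigned byte variant in a standalone kernel; the PTX strings are taken from the hunk above, while the plain C++ fallback is added here only so the sketch covers targets below sm_20 (the patch itself leaves the result at zero in that case).

```cpp
#include <cuda_runtime.h>

__device__ unsigned int vabsdiff4_u8(unsigned int a, unsigned int b)
{
    unsigned int res = 0;
#if __CUDA_ARCH__ >= 300
    asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
    asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
    asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
    asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#else
    // Plain per-byte fallback, added in this sketch only (not in the patch).
    for (int s = 0; s < 32; s += 8)
    {
        int x = (a >> s) & 0xff, y = (b >> s) & 0xff;
        res |= static_cast<unsigned int>(x > y ? x - y : y - x) << s;
    }
#endif
    return res;
}

__global__ void absdiff4_kernel(const unsigned int* a, const unsigned int* b,
                                unsigned int* c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = vabsdiff4_u8(a[i], b[i]);
}
```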
PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // absdiff - - template struct Absdiff : binary_function + template struct AbsDiffMat : binary_function { - static __device__ __forceinline__ int abs(int a) - { - return ::abs(a); - } - static __device__ __forceinline__ float abs(float a) - { - return ::fabsf(a); - } - static __device__ __forceinline__ double abs(double a) - { - return ::fabs(a); - } - __device__ __forceinline__ T operator ()(T a, T b) const { - return saturate_cast(::abs(a - b)); + return saturate_cast(_abs(a - b)); } + + __device__ __forceinline__ AbsDiffMat() {} + __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits + { }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + //////////////////////////////////// + + template struct 
TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::AbsDiffMat > : arithm::ArithmFuncTraits { - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Absdiff(), WithOutMask(), stream); + }; +}}} + +namespace arithm +{ + template + void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff4(), WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template struct AbsdiffScalar : unary_function + template + void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - AbsdiffScalar(double val_) : val(val_) {} + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff2(), WithOutMask(), stream); + } + + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AbsDiffMat(), WithOutMask(), stream); + } + + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void 
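For element types that do not fit the packed paths, absDiffMat falls back to the generic AbsDiffMat functor, which now calls a small set of free __device__ overloads (_abs for int, float and double in the patch) so overload resolution picks the matching intrinsic for each element type. A minimal standalone version of that pattern, with static_cast standing in for saturate_cast<T>:

```cpp
#include <cuda_runtime.h>
#include <cstdlib>
#include <cmath>

// Free overloads mirroring the patch's _abs helpers (renamed here to avoid
// a reserved identifier at global scope).
__device__ __forceinline__ int    my_abs(int a)    { return ::abs(a); }
__device__ __forceinline__ float  my_abs(float a)  { return ::fabsf(a); }
__device__ __forceinline__ double my_abs(double a) { return ::fabs(a); }

template <typename T>
__global__ void absdiff_kernel(const T* a, const T* b, T* c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        c[i] = static_cast<T>(my_abs(a[i] - b[i]));
}

template __global__ void absdiff_kernel<int>(const int*, const int*, int*, int);
template __global__ void absdiff_kernel<float>(const float*, const float*, float*, int);
```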
absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absDiffScalar + +namespace arithm +{ + template struct AbsDiffScalar : unary_function + { + S val; + + explicit AbsDiffScalar(S val_) : val(val_) {} + __device__ __forceinline__ T operator ()(T a) const { - return saturate_cast(::fabs(a - val)); + abs_func f; + return saturate_cast(f(a - val)); } - double val; }; +} - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::AbsDiffScalar > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AbsdiffScalar op(val); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + AbsDiffScalar op(static_cast(val)); + + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////////////////// - // Compare +////////////////////////////////////////////////////////////////////////// +// absMat - template
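absDiffScalar follows the same pattern as the other scalar variants: the value is stored in the working type S and the absolute difference goes through OpenCV's abs_func functor before saturating back to T. A self-contained sketch under assumptions: the parameter names <T, S> are guessed, a ternary replaces abs_func, and static_cast replaces saturate_cast<T> (safe in this example because |a - val| stays within uchar range).

```cpp
#include <cuda_runtime.h>

template <typename T, typename S>
struct AbsDiffScalar
{
    S val;
    explicit AbsDiffScalar(S v) : val(v) {}
    __device__ T operator()(T a) const
    {
        S d = a - val;
        return static_cast<T>(d < 0 ? -d : d);
    }
};

__global__ void absdiff_scalar_kernel(const unsigned char* src, unsigned char* dst,
                                      int n, AbsDiffScalar<unsigned char, float> op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = op(src[i]);
}
// Launch as, e.g.:
//   absdiff_scalar_kernel<<<grid, block>>>(src, dst, n,
//       AbsDiffScalar<unsigned char, float>(128.0f));
```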