Mirror of https://github.com/opencv/opencv.git (synced 2025-01-18 06:03:15 +08:00)

Merge pull request #328 from jet47:new-gpu-fixes

Commit 11dfceb2c9
@@ -110,14 +110,15 @@ endif()

# Optional 3rd party components
# ===================================================
OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS)
OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) )
OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON)
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS))
OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_GTK "Include GTK support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_IMAGEIO "ImageIO support for OS X" OFF IF APPLE)

@@ -140,9 +141,9 @@ OCV_OPTION(WITH_V4L "Include Video 4 Linux support" ON
OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON IF WIN32 )
OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) )
OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )

# OpenCV build components

@@ -161,12 +162,12 @@ OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF I
OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID )

# 3rd party libs
OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE )
OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE OR CARMA )
OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE OR CARMA )

# OpenCV installation options

@@ -776,8 +777,9 @@ if(HAVE_CUDA)
status("")
status(" NVIDIA CUDA")

status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO)
status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO)
status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO)
status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO)
status(" USE NVCUVID:" HAVE_NVCUVID THEN YES ELSE NO)
status(" NVIDIA GPU arch:" ${OPENCV_CUDA_ARCH_BIN})
status(" NVIDIA PTX archs:" ${OPENCV_CUDA_ARCH_PTX})
status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
@@ -3,17 +3,17 @@ if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
return()
endif()

if (WIN32 AND NOT MSVC)
message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler suppoted on your platform).")
if(WIN32 AND NOT MSVC)
message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler supported on your platform).")
return()
endif()

if (CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(STATUS "CUDA compilation is disabled (due to Clang unsuppoted on your platform).")
if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(STATUS "CUDA compilation is disabled (due to Clang unsupported on your platform).")
return()
endif()

find_package(CUDA 4.1)
find_package(CUDA 4.2 QUIET)

if(CUDA_FOUND)
set(HAVE_CUDA 1)

@@ -26,15 +26,20 @@ if(CUDA_FOUND)
set(HAVE_CUBLAS 1)
endif()

message(STATUS "CUDA detected: " ${CUDA_VERSION})

if(${CUDA_VERSION_STRING} VERSION_GREATER "4.1")
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
if(WITH_NVCUVID)
find_cuda_helper_libs(nvcuvid)
set(HAVE_NVCUVID 1)
endif()

set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
message(STATUS "CUDA detected: " ${CUDA_VERSION})

if (CARMA)
set(CUDA_ARCH_BIN "2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
endif()

string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")

@@ -72,11 +77,20 @@ if(CUDA_FOUND)

# Tell NVCC to add PTX intermediate code for the specified architectures
string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}")
foreach(ARCH IN LISTS ARCH_LIST)
set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH})
set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}")
set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
endforeach()
foreach(ARCH IN LISTS ARCH_LIST)
set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH})
set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}")
set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
endforeach()

if(CARMA)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM" )

if (CMAKE_VERSION VERSION_LESS 2.8.10)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -ccbin=${CMAKE_CXX_COMPILER}" )
endif()

endif()

# These vars will be processed in other scripts
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})

@@ -84,7 +98,7 @@ if(CUDA_FOUND)

message(STATUS "CUDA NVCC target flags: ${CUDA_NVCC_FLAGS}")

OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF)
OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF)

if(CUDA_FAST_MATH)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)

@@ -92,7 +106,6 @@ if(CUDA_FOUND)

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)

unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp)

macro(ocv_cuda_compile VAR)

@@ -106,15 +119,15 @@ if(CUDA_FOUND)
string(REPLACE "-ggdb3" "" ${var} "${${var}}")
endforeach()

if (BUILD_SHARED_LIBS)
if(BUILD_SHARED_LIBS)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -DCVAPI_EXPORTS)
endif()

if(UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
endif()
if(APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
endif()

# disabled because of multiple warnings during building nvcc auto generated files
@@ -42,8 +42,9 @@
set(OpenCV_COMPUTE_CAPABILITIES @OpenCV_CUDA_CC_CONFIGCMAKE@)

set(OpenCV_CUDA_VERSION @OpenCV_CUDA_VERSION@)
set(OpenCV_USE_CUBLAS @HAVE_CUBLAS@)
set(OpenCV_USE_CUFFT @HAVE_CUFFT@)
set(OpenCV_USE_CUBLAS @HAVE_CUBLAS@)
set(OpenCV_USE_CUFFT @HAVE_CUFFT@)
set(OpenCV_USE_NVCUVID @HAVE_NVCUVID@)

# Android API level from which OpenCV has been compiled is remembered
set(OpenCV_ANDROID_NATIVE_API_LEVEL @OpenCV_ANDROID_NATIVE_API_LEVEL_CONFIGCMAKE@)

@@ -218,17 +219,22 @@ foreach(__opttype OPT DBG)
else()
#TODO: duplicates are annoying but they should not be the problem
endif()
# fix hard coded paths for CUDA libraries under Windows
if(WIN32 AND OpenCV_CUDA_VERSION AND NOT OpenCV_SHARED)

# CUDA
if(OpenCV_CUDA_VERSION AND (CARMA OR (WIN32 AND NOT OpenCV_SHARED)))
if(NOT CUDA_FOUND)
find_package(CUDA ${OpenCV_CUDA_VERSION} EXACT REQUIRED)
else()
if(NOT CUDA_VERSION_STRING VERSION_EQUAL OpenCV_CUDA_VERSION)
message(FATAL_ERROR "OpenCV static library compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
if(WIN32)
message(FATAL_ERROR "OpenCV static library was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
else()
message(FATAL_ERROR "OpenCV library for CARMA was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
endif()
endif()
endif()

list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_nvcuvid_LIBRARY} ${CUDA_nvcuvenc_LIBRARY})
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})

if(OpenCV_USE_CUBLAS)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUBLAS_LIBRARIES})

@@ -238,6 +244,13 @@ foreach(__opttype OPT DBG)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUFFT_LIBRARIES})
endif()

if(OpenCV_USE_NVCUVID)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvid_LIBRARIES})
endif()

if(WIN32)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvenc_LIBRARIES})
endif()
endif()
endforeach()
@@ -175,21 +175,15 @@
/* NVidia Cuda Runtime API*/
#cmakedefine HAVE_CUDA

/* OpenCL Support */
#cmakedefine HAVE_OPENCL

/* AMD's OpenCL Fast Fourier Transform Library*/
#cmakedefine HAVE_CLAMDFFT

/* AMD's Basic Linear Algebra Subprograms Library*/
#cmakedefine HAVE_CLAMDBLAS

/* NVidia Cuda Fast Fourier Transform (FFT) API*/
#cmakedefine HAVE_CUFFT

/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
#cmakedefine HAVE_CUBLAS

/* NVidia Video Decoding API*/
#cmakedefine HAVE_NVCUVID

/* Compile for 'real' NVIDIA GPU architectures */
#define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"

@@ -202,6 +196,15 @@
/* Create PTX or BIN for 1.0 compute capability */
#cmakedefine CUDA_ARCH_BIN_OR_PTX_10

/* OpenCL Support */
#cmakedefine HAVE_OPENCL

/* AMD's OpenCL Fast Fourier Transform Library*/
#cmakedefine HAVE_CLAMDFFT

/* AMD's Basic Linear Algebra Subprograms Library*/
#cmakedefine HAVE_CLAMDBLAS

/* VideoInput library */
#cmakedefine HAVE_VIDEOINPUT
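Editorial example (not part of the patch), assuming the generated cvconfig.h is on the include path: the #cmakedefine entries above become ordinary preprocessor switches on the C++ side.

#include "cvconfig.h"

void reportCudaBuildOptions()
{
#ifdef HAVE_CUFFT
    // cuFFT-backed code paths were compiled in.
#endif
#ifdef HAVE_NVCUVID
    // NVCUVID video-decoding code paths were compiled in.
#endif
}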
@@ -10,7 +10,6 @@ if(HAVE_CUDA)
file(GLOB lib_cuda "src/cuda/*.cu")
ocv_cuda_compile(cuda_objs ${lib_cuda})


set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
else()
set(lib_cuda "")
@@ -177,6 +177,20 @@ namespace cv
//#undef __CV_GPU_DEPR_BEFORE__
//#undef __CV_GPU_DEPR_AFTER__

namespace device
{
using cv::gpu::PtrSz;
using cv::gpu::PtrStep;
using cv::gpu::PtrStepSz;

using cv::gpu::PtrStepSzb;
using cv::gpu::PtrStepSzf;
using cv::gpu::PtrStepSzi;

using cv::gpu::PtrStepb;
using cv::gpu::PtrStepf;
using cv::gpu::PtrStepi;
}
}
}
@@ -79,6 +79,8 @@ namespace cv { namespace gpu
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
};

CV_EXPORTS bool deviceSupports(FeatureSet feature_set);

// Gives information about what GPU archs this OpenCV GPU module was
// compiled for
class CV_EXPORTS TargetArchs
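Editorial usage sketch (not from the patch): the new deviceSupports() entry point combines the compile-time arch check (TargetArchs::builtWith) with the runtime compute capability of the current device, so callers can guard Kepler-only paths such as warp shuffles.

#include <opencv2/gpu/gpu.hpp>

void launchReduction()
{
    if (cv::gpu::deviceSupports(cv::gpu::WARP_SHUFFLE_FUNCTIONS))
    {
        // Binary and device both support compute 3.0: take the __shfl-based path.
    }
    else
    {
        // Fall back to a shared-memory implementation.
    }
}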
@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/type_traits.hpp"

namespace cv { namespace gpu { namespace device
{

@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
void writeScalar(const int*);
void writeScalar(const float*);
void writeScalar(const double*);
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}}

@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////

template <typename T, typename D> struct Convertor : unary_function<T, D>
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
{
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}

__device__ __forceinline__ D operator()(const T& src) const
__device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
{
return saturate_cast<D>(alpha * src + beta);
}

double alpha, beta;
S alpha, beta;
};

namespace detail

@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
};
}

template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
{
};

template<typename T, typename D>
template<typename T, typename D, typename S>
void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
{
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta);
Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
}
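Editorial note (not part of the diff): the extra S template parameter lets the scale/shift arithmetic run in float for small source/destination depths and only fall back to double where precision requires it. A rough host-side equivalent of what Convertor<uchar, uchar, float> now computes:

#include <opencv2/core/core.hpp>

// Illustrative only: the old Convertor<uchar, uchar> always evaluated in double.
inline unsigned char convertPixel(unsigned char src, float alpha, float beta)
{
    return cv::saturate_cast<unsigned char>(alpha * src + beta);
}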
@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);

static const caller_t tab[8][8] =
static const caller_t tab[7][7] =
{
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},

{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},

{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},

{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},

{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},

{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},

{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},

{0,0,0,0,0,0,0,0}
{
cvt_<uchar, uchar, float>,
cvt_<uchar, schar, float>,
cvt_<uchar, ushort, float>,
cvt_<uchar, short, float>,
cvt_<uchar, int, float>,
cvt_<uchar, float, float>,
cvt_<uchar, double, double>
},
{
cvt_<schar, uchar, float>,
cvt_<schar, schar, float>,
cvt_<schar, ushort, float>,
cvt_<schar, short, float>,
cvt_<schar, int, float>,
cvt_<schar, float, float>,
cvt_<schar, double, double>
},
{
cvt_<ushort, uchar, float>,
cvt_<ushort, schar, float>,
cvt_<ushort, ushort, float>,
cvt_<ushort, short, float>,
cvt_<ushort, int, float>,
cvt_<ushort, float, float>,
cvt_<ushort, double, double>
},
{
cvt_<short, uchar, float>,
cvt_<short, schar, float>,
cvt_<short, ushort, float>,
cvt_<short, short, float>,
cvt_<short, int, float>,
cvt_<short, float, float>,
cvt_<short, double, double>
},
{
cvt_<int, uchar, float>,
cvt_<int, schar, float>,
cvt_<int, ushort, float>,
cvt_<int, short, float>,
cvt_<int, int, double>,
cvt_<int, float, double>,
cvt_<int, double, double>
},
{
cvt_<float, uchar, float>,
cvt_<float, schar, float>,
cvt_<float, ushort, float>,
cvt_<float, short, float>,
cvt_<float, int, float>,
cvt_<float, float, float>,
cvt_<float, double, double>
},
{
cvt_<double, uchar, double>,
cvt_<double, schar, double>,
cvt_<double, ushort, double>,
cvt_<double, short, double>,
cvt_<double, int, double>,
cvt_<double, float, double>,
cvt_<double, double, double>
}
};

caller_t func = tab[sdepth][ddepth];
if (!func)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");

func(src, dst, alpha, beta, stream);
}
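Added note (not part of the patch): sdepth and ddepth are OpenCV depth codes, so the table above is indexed CV_8U=0, CV_8S=1, CV_16U=2, CV_16S=3, CV_32S=4, CV_32F=5, CV_64F=6; the old 8x8 layout carried an unused row and column. A tiny sketch of the lookup, with hypothetical variable names:

#include <opencv2/core/core.hpp>

// Sketch only: a CV_32S -> CV_32F conversion would select tab[CV_32S][CV_32F],
// i.e. cvt_<int, float, double> in the new 7x7 layout.
const int sdepth = CV_32S;  // 4
const int ddepth = CV_32F;  // 5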
@@ -45,8 +45,7 @@
#include <iostream>

#ifdef HAVE_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <npp.h>

#define CUDART_MINIMUM_REQUIRED_VERSION 4010
@@ -69,33 +68,89 @@ using namespace cv::gpu;

namespace
{
// Compares value to set using the given comparator. Returns true if
// there is at least one element x in the set satisfying to: x cmp value
// predicate.
template <typename Comparer>
bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
class CudaArch
{
public:
CudaArch();

bool builtWith(FeatureSet feature_set) const;
bool hasPtx(int major, int minor) const;
bool hasBin(int major, int minor) const;
bool hasEqualOrLessPtx(int major, int minor) const;
bool hasEqualOrGreaterPtx(int major, int minor) const;
bool hasEqualOrGreaterBin(int major, int minor) const;

private:
static void fromStr(const string& set_as_str, vector<int>& arr);

vector<int> bin;
vector<int> ptx;
vector<int> features;
};

const CudaArch cudaArch;

CudaArch::CudaArch()
{
#ifdef HAVE_CUDA
fromStr(CUDA_ARCH_BIN, bin);
fromStr(CUDA_ARCH_PTX, ptx);
fromStr(CUDA_ARCH_FEATURES, features);
#endif
}

bool CudaArch::builtWith(FeatureSet feature_set) const
{
return !features.empty() && (features.back() >= feature_set);
}

bool CudaArch::hasPtx(int major, int minor) const
{
return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
}

bool CudaArch::hasBin(int major, int minor) const
{
return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
}

bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.front() <= major * 10 + minor);
}

bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.back() >= major * 10 + minor);
}

bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
{
return !bin.empty() && (bin.back() >= major * 10 + minor);
}

void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
{
if (set_as_str.find_first_not_of(" ") == string::npos)
return false;
return;

std::stringstream stream(set_as_str);
istringstream stream(set_as_str);
int cur_value;

while (!stream.eof())
{
stream >> cur_value;
if (cmp(cur_value, value))
return true;
arr.push_back(cur_value);
}

return false;
sort(arr.begin(), arr.end());
}
}

bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>());
return cudaArch.builtWith(feature_set);
#else
(void)feature_set;
return false;
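Editorial illustration (hypothetical values, not from the patch): CUDA_ARCH_BIN, CUDA_ARCH_PTX and CUDA_ARCH_FEATURES are the space-separated arch strings baked in by cvconfig.h, so CudaArch::fromStr effectively performs the parsing sketched below, after which e.g. hasEqualOrGreaterBin(3, 0) just checks bin.back() >= 30.

#include <algorithm>
#include <sstream>
#include <string>
#include <vector>

// Standalone sketch of what CudaArch::fromStr computes (fromStr itself is private).
std::vector<int> parseArchList(const std::string& s)
{
    std::vector<int> arr;
    std::istringstream stream(s);
    int v;
    while (stream >> v)
        arr.push_back(v);
    std::sort(arr.begin(), arr.end());
    return arr;  // "11 12 13 20 21 30" -> {11, 12, 13, 20, 21, 30}
}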
@@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>());
return cudaArch.hasPtx(major, minor);
#else
(void)major;
(void)minor;

@@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasBin(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>());
return cudaArch.hasBin(major, minor);
#else
(void)major;
(void)minor;

@@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor,
std::less_equal<int>());
return cudaArch.hasEqualOrLessPtx(major, minor);
#else
(void)major;
(void)minor;

@@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)

bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
{
return hasEqualOrGreaterPtx(major, minor) ||
hasEqualOrGreaterBin(major, minor);
return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
}

bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>());
return cudaArch.hasEqualOrGreaterPtx(major, minor);
#else
(void)major;
(void)minor;

@@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor,
std::greater_equal<int>());
return cudaArch.hasEqualOrGreaterBin(major, minor);
#else
(void)major;
(void)minor;

@@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
#endif
}

bool cv::gpu::deviceSupports(FeatureSet feature_set)
{
static int versions[] =
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));

const int devId = getDevice();

int version;

if (devId < cache_size && versions[devId] >= 0)
version = versions[devId];
else
{
DeviceInfo dev(devId);
version = dev.majorVersion() * 10 + dev.minorVersion();
if (devId < cache_size)
versions[devId] = version;
}

return TargetArchs::builtWith(feature_set) && (version >= feature_set);
}

#if !defined (HAVE_CUDA)

#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
@@ -315,18 +392,6 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)

namespace
{
template <class T> void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
*attribute = T();
//CUresult error = CUDA_SUCCESS;// = cuDeviceGetAttribute( attribute, device_attribute, device ); why link erros under ubuntu??
CUresult error = cuDeviceGetAttribute( attribute, device_attribute, device );
if( CUDA_SUCCESS == error )
return;

printf("Driver API error = %04d\n", error);
cv::gpu::error("driver API error", __FILE__, __LINE__);
}

int convertSMVer2Cores(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM

@@ -335,7 +400,7 @@ namespace
int Cores;
} SMtoCores;

SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, { -1, -1 } };
SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } };

int index = 0;
while (gpuArchCoresPerSM[index].SM != -1)

@@ -344,7 +409,7 @@ namespace
return gpuArchCoresPerSM[index].Cores;
index++;
}
printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);

return -1;
}
}
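Editorial illustration (not in the patch): with the added {0x35, 192} entry, devices reporting SM 3.5 now resolve to a core count instead of falling through to the -1 error path.

// Called from within core.cpp's anonymous namespace; 0x35 encodes major 3, minor 5.
int coresPerSM = convertSMVer2Cores(3, 5);   // 192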
@@ -382,22 +447,13 @@ void cv::gpu::printCudaDeviceInfo(int device)
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor);
printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n",
prop.multiProcessorCount, convertSMVer2Cores(prop.major, prop.minor),
convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);

int cores = convertSMVer2Cores(prop.major, prop.minor);
if (cores > 0)
printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);

printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f);

// This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
int memoryClock, memBusWidth, L2CacheSize;
getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );

printf(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f);
printf(" Memory Bus Width: %d-bit\n", memBusWidth);
if (L2CacheSize)
printf(" L2 Cache Size: %d bytes\n", L2CacheSize);

printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);

@@ -457,7 +513,12 @@ void cv::gpu::printShortCudaDeviceInfo(int device)

const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);

int cores = convertSMVer2Cores(prop.major, prop.minor);
if (cores > 0)
printf(", %d cores", cores * prop.multiProcessorCount);

printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
}
fflush(stdout);
@@ -5,7 +5,7 @@ endif()
set(the_description "GPU-accelerated Computer Vision")
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_photo opencv_legacy)

ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda" "${CMAKE_CURRENT_SOURCE_DIR}/../highgui/src")
ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")

file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.hpp" "include/opencv2/${name}/device/*.h")

@@ -15,24 +15,21 @@ file(GLOB lib_cuda_hdrs "src/cuda/*.hpp" "src/cuda/*.h")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_cuda "src/cuda/*.cu*")

source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
source_group("Device" FILES ${lib_device_hdrs})
source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})

if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
set(ncv_files ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
set(ncv_files ${ncv_srcs} ${ncv_cuda})

source_group("Src\\NVidia" FILES ${ncv_files})
ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS})
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations /wd4211 /wd4201 /wd4100 /wd4505 /wd4408)
string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")

#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")

if(MSVC)

@@ -47,23 +44,18 @@ if (HAVE_CUDA)

ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})

#CUDA_BUILD_CLEAN_TARGET()

set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})

if(NOT APPLE)
unset(CUDA_nvcuvid_LIBRARY CACHE)
find_cuda_helper_libs(nvcuvid)
if(WITH_NVCUVID)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
endif()

if(WIN32)
unset(CUDA_nvcuvenc_LIBRARY CACHE)
find_cuda_helper_libs(nvcuvenc)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
endif()

if(NOT APPLE AND WITH_FFMPEG)
if(WITH_FFMPEG)
set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES})
endif()
else()
@@ -216,6 +216,86 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)

OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS

OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)

OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)

#undef OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)

OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS

OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)

OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)

#undef OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS
}}} // namespace cv { namespace gpu { namespace device

#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
@@ -85,8 +85,6 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}

#ifdef __CUDACC__

namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)

@@ -96,19 +94,25 @@ namespace cv { namespace gpu

namespace device
{
using cv::gpu::divUp;

#ifdef __CUDACC__
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
#ifdef _WIN32
typedef unsigned int uint;
#endif

template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
#endif // __CUDACC__
}
}}

#endif // __CUDACC__


#endif // __OPENCV_GPU_COMMON_HPP__
File diff suppressed because one or more lines are too long

modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp (new file, 361 lines)
@@ -0,0 +1,361 @@
/* [Standard OpenCV BSD license header omitted for brevity] */
#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
namespace reduce_detail
{
template <typename T> struct GetType;
template <typename T> struct GetType<T*>
{
typedef T type;
};
template <typename T> struct GetType<volatile T*>
{
typedef T type;
};
template <typename T> struct GetType<T&>
{
typedef T type;
};

template <unsigned int I, unsigned int N>
struct For
{
template <class PointerTuple, class ValTuple>
static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
{
thrust::get<I>(smem)[tid] = thrust::get<I>(val);

For<I + 1, N>::loadToSmem(smem, val, tid);
}
template <class PointerTuple, class ValTuple>
static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
{
thrust::get<I>(val) = thrust::get<I>(smem)[tid];

For<I + 1, N>::loadFromSmem(smem, val, tid);
}

template <class PointerTuple, class ValTuple, class OpTuple>
static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
{
typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

For<I + 1, N>::merge(smem, val, tid, delta, op);
}
template <class ValTuple, class OpTuple>
static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
{
typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

For<I + 1, N>::mergeShfl(val, delta, width, op);
}
};
template <unsigned int N>
struct For<N, N>
{
template <class PointerTuple, class ValTuple>
static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
{
}
template <class PointerTuple, class ValTuple>
static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
{
}

template <class PointerTuple, class ValTuple, class OpTuple>
static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
{
}
template <class ValTuple, class OpTuple>
static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
{
}
};

template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
{
smem[tid] = val;
}
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
{
val = smem[tid];
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
}

template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
{
T reg = smem[tid + delta];
smem[tid] = val = op(val, reg);
}
template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
{
T reg = shfl_down(val, delta, width);
val = op(val, reg);
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int tid,
unsigned int delta,
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
}
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
unsigned int delta,
unsigned int width,
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
}

template <unsigned int N> struct Generic
{
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
loadToSmem(smem, val, tid);
if (N >= 32)
__syncthreads();

if (N >= 2048)
{
if (tid < 1024)
merge(smem, val, tid, 1024, op);

__syncthreads();
}
if (N >= 1024)
{
if (tid < 512)
merge(smem, val, tid, 512, op);

__syncthreads();
}
if (N >= 512)
{
if (tid < 256)
merge(smem, val, tid, 256, op);

__syncthreads();
}
if (N >= 256)
{
if (tid < 128)
merge(smem, val, tid, 128, op);

__syncthreads();
}
if (N >= 128)
{
if (tid < 64)
merge(smem, val, tid, 64, op);

__syncthreads();
}
if (N >= 64)
{
if (tid < 32)
merge(smem, val, tid, 32, op);
}

if (tid < 16)
{
merge(smem, val, tid, 16, op);
merge(smem, val, tid, 8, op);
merge(smem, val, tid, 4, op);
merge(smem, val, tid, 2, op);
merge(smem, val, tid, 1, op);
}
}
};

template <unsigned int I, typename Pointer, typename Reference, class Op>
struct Unroll
{
static __device__ void loopShfl(Reference val, Op op, unsigned int N)
{
mergeShfl(val, I, N, op);
Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
}
static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
{
merge(smem, val, tid, I, op);
Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
}
};
template <typename Pointer, typename Reference, class Op>
struct Unroll<0, Pointer, Reference, Op>
{
static __device__ void loopShfl(Reference, Op, unsigned int)
{
}
static __device__ void loop(Pointer, Reference, unsigned int, Op)
{
}
};

template <unsigned int N> struct WarpOptimized
{
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
#if __CUDA_ARCH__ >= 300
(void) smem;
(void) tid;

Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
#else
loadToSmem(smem, val, tid);

if (tid < N / 2)
Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
};

template <unsigned int N> struct GenericOptimized32
{
enum { M = N / 32 };

template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
const unsigned int laneId = Warp::laneId();

#if __CUDA_ARCH__ >= 300
Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

if (laneId == 0)
loadToSmem(smem, val, tid / 32);
#else
loadToSmem(smem, val, tid);

if (laneId < 16)
Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

__syncthreads();

if (laneId == 0)
loadToSmem(smem, val, tid / 32);
#endif

__syncthreads();

loadFromSmem(smem, val, tid);

if (tid < 32)
{
#if __CUDA_ARCH__ >= 300
Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
#else
Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
}
};

template <bool val, class T1, class T2> struct StaticIf;
template <class T1, class T2> struct StaticIf<true, T1, T2>
{
typedef T1 type;
};
template <class T1, class T2> struct StaticIf<false, T1, T2>
{
typedef T2 type;
};

template <unsigned int N> struct IsPowerOf2
{
enum { value = ((N != 0) && !(N & (N - 1))) };
};

template <unsigned int N> struct Dispatcher
{
typedef typename StaticIf<
(N <= 32) && IsPowerOf2<N>::value,
WarpOptimized<N>,
typename StaticIf<
(N <= 1024) && IsPowerOf2<N>::value,
GenericOptimized32<N>,
Generic<N>
>::type
>::type reductor;
};
}
}}}

#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
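Editorial sketch (not part of the new header): Dispatcher<N>::reductor resolves at compile time to WarpOptimized<N> for power-of-two block sizes up to 32, GenericOptimized32<N> for power-of-two sizes up to 1024, and Generic<N> otherwise.

#include "opencv2/gpu/device/detail/reduce.hpp"

using namespace cv::gpu::device::reduce_detail;

typedef Dispatcher<32>::reductor  WarpSized;   // WarpOptimized<32>
typedef Dispatcher<256>::reductor BlockSized;  // GenericOptimized32<256>
typedef Dispatcher<100>::reductor NonPow2;     // Generic<100>, since 100 is not a power of two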
modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp (new file, 498 lines)

@@ -0,0 +1,498 @@
/* [Standard OpenCV BSD license header omitted for brevity] */
#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
||||
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "../warp.hpp"
|
||||
#include "../warp_shuffle.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace reduce_key_val_detail
|
||||
{
|
||||
template <typename T> struct GetType;
|
||||
template <typename T> struct GetType<T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<volatile T*>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
template <typename T> struct GetType<T&>
|
||||
{
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template <unsigned int I, unsigned int N>
|
||||
struct For
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(data);
|
||||
|
||||
For<I + 1, N>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(data) = thrust::get<I>(smem)[tid];
|
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
|
||||
{
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
|
||||
For<I + 1, N>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
|
||||
For<I + 1, N>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
}
|
||||
|
||||
For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
|
||||
const ValPointerTuple& svals, const ValReferenceTuple& val,
|
||||
const CmpTuple& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
}
|
||||
|
||||
For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
};
|
||||
template <unsigned int N>
|
||||
struct For<N, N>
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
|
||||
const ValPointerTuple&, const ValReferenceTuple&,
|
||||
const CmpTuple&,
|
||||
unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// loadToSmem
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
smem[tid] = data;
|
||||
}
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
data = smem[tid];
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// copyVals
|
||||
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
|
||||
{
|
||||
val = shfl_down(val, delta, width);
|
||||
}
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
svals[tid] = val = svals[tid + delta];
|
||||
}
|
||||
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int delta,
|
||||
int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// merge
|
||||
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
|
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Generic
|
||||
|
||||
template <unsigned int N> struct Generic
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadValsToSmem(svals, val, tid);
|
||||
if (N >= 32)
|
||||
__syncthreads();
|
||||
|
||||
if (N >= 2048)
|
||||
{
|
||||
if (tid < 1024)
|
||||
merge(skeys, key, svals, val, cmp, tid, 1024);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 1024)
|
||||
{
|
||||
if (tid < 512)
|
||||
merge(skeys, key, svals, val, cmp, tid, 512);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 512)
|
||||
{
|
||||
if (tid < 256)
|
||||
merge(skeys, key, svals, val, cmp, tid, 256);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 256)
|
||||
{
|
||||
if (tid < 128)
|
||||
merge(skeys, key, svals, val, cmp, tid, 128);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 128)
|
||||
{
|
||||
if (tid < 64)
|
||||
merge(skeys, key, svals, val, cmp, tid, 64);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 64)
|
||||
{
|
||||
if (tid < 32)
|
||||
merge(skeys, key, svals, val, cmp, tid, 32);
|
||||
}
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, 16);
|
||||
merge(skeys, key, svals, val, cmp, tid, 8);
|
||||
merge(skeys, key, svals, val, cmp, tid, 4);
|
||||
merge(skeys, key, svals, val, cmp, tid, 2);
|
||||
merge(skeys, key, svals, val, cmp, tid, 1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
|
||||
struct Unroll
|
||||
{
|
||||
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
|
||||
{
|
||||
mergeShfl(key, val, cmp, I, N);
|
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||
}
|
||||
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, I);
|
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
};
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
struct Unroll<0, KP, KR, VP, VR, Cmp>
|
||||
{
|
||||
static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
|
||||
{
|
||||
}
|
||||
static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct WarpOptimized
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
(void) skeys;
|
||||
(void) svals;
|
||||
(void) tid;
|
||||
|
||||
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||
#else
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadToSmem(svals, val, tid);
|
||||
|
||||
if (tid < N / 2)
|
||||
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct GenericOptimized32
|
||||
{
|
||||
enum { M = N / 32 };
|
||||
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
const unsigned int laneId = Warp::laneId();
|
||||
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
|
||||
|
||||
if (laneId == 0)
|
||||
{
|
||||
loadToSmem(skeys, key, tid / 32);
|
||||
loadToSmem(svals, val, tid / 32);
|
||||
}
|
||||
#else
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadToSmem(svals, val, tid);
|
||||
|
||||
if (laneId < 16)
|
||||
Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (laneId == 0)
|
||||
{
|
||||
loadToSmem(skeys, key, tid / 32);
|
||||
loadToSmem(svals, val, tid / 32);
|
||||
}
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
loadFromSmem(skeys, key, tid);
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
loadFromSmem(svals, val, tid);
|
||||
|
||||
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
|
||||
#else
|
||||
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool val, class T1, class T2> struct StaticIf;
|
||||
template <class T1, class T2> struct StaticIf<true, T1, T2>
|
||||
{
|
||||
typedef T1 type;
|
||||
};
|
||||
template <class T1, class T2> struct StaticIf<false, T1, T2>
|
||||
{
|
||||
typedef T2 type;
|
||||
};
|
||||
|
||||
template <unsigned int N> struct IsPowerOf2
|
||||
{
|
||||
enum { value = ((N != 0) && !(N & (N - 1))) };
|
||||
};
|
||||
|
||||
template <unsigned int N> struct Dispatcher
|
||||
{
|
||||
typedef typename StaticIf<
|
||||
(N <= 32) && IsPowerOf2<N>::value,
|
||||
WarpOptimized<N>,
|
||||
typename StaticIf<
|
||||
(N <= 1024) && IsPowerOf2<N>::value,
|
||||
GenericOptimized32<N>,
|
||||
Generic<N>
|
||||
>::type
|
||||
>::type reductor;
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
@ -1,841 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__
|
||||
#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace utility_detail
|
||||
{
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reductor
|
||||
|
||||
template <int n> struct WarpReductor
|
||||
{
|
||||
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
if (tid < n)
|
||||
data[tid] = partial_reduction;
|
||||
if (n > 32) __syncthreads();
|
||||
|
||||
if (n > 32)
|
||||
{
|
||||
if (tid < n - 32)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
if (tid < 16)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 16)
|
||||
{
|
||||
if (tid < n - 16)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
if (tid < 8)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 8)
|
||||
{
|
||||
if (tid < n - 8)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
if (tid < 4)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 4)
|
||||
{
|
||||
if (tid < n - 4)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
if (tid < 2)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 2)
|
||||
{
|
||||
if (tid < n - 2)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
if (tid < 2)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<64>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
__syncthreads();
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<32>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<16>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 8)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<8>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 4)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool warp> struct ReductionDispatcher;
|
||||
template <> struct ReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
WarpReductor<n>::reduce(data, partial_reduction, tid, op);
|
||||
}
|
||||
};
|
||||
template <> struct ReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
if (tid < n)
|
||||
data[tid] = partial_reduction;
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }
|
||||
if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }
|
||||
if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); }
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PredValWarpReductor
|
||||
|
||||
template <int n> struct PredValWarpReductor;
|
||||
template <> struct PredValWarpReductor<64>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 32)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 32];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 32];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredValWarpReductor<32>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 16)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct PredValWarpReductor<16>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 8)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredValWarpReductor<8>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 4)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool warp> struct PredValReductionDispatcher;
|
||||
template <> struct PredValReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);
|
||||
}
|
||||
};
|
||||
template <> struct PredValReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
if (n >= 512 && tid < 256)
|
||||
{
|
||||
T reg = sdata[tid + 256];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 256];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 256 && tid < 128)
|
||||
{
|
||||
T reg = sdata[tid + 128];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 128 && tid < 64)
|
||||
{
|
||||
T reg = sdata[tid + 64];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (n >= 64)
|
||||
{
|
||||
T reg = sdata[tid + 32];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 32];
|
||||
}
|
||||
}
|
||||
if (n >= 32)
|
||||
{
|
||||
T reg = sdata[tid + 16];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
}
|
||||
if (n >= 16)
|
||||
{
|
||||
T reg = sdata[tid + 8];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
}
|
||||
if (n >= 8)
|
||||
{
|
||||
T reg = sdata[tid + 4];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
}
|
||||
if (n >= 4)
|
||||
{
|
||||
T reg = sdata[tid + 2];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
}
|
||||
if (n >= 2)
|
||||
{
|
||||
T reg = sdata[tid + 1];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PredVal2WarpReductor
|
||||
|
||||
template <int n> struct PredVal2WarpReductor;
|
||||
template <> struct PredVal2WarpReductor<64>
|
||||
{
|
||||
template <typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 32)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
T reg = sdata[tid + 32];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 32];
|
||||
sval2[tid] = myVal2 = sval2[tid + 32];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 16];
|
||||
sval2[tid] = myVal2 = sval2[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 8];
|
||||
sval2[tid] = myVal2 = sval2[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredVal2WarpReductor<32>
|
||||
{
|
||||
template <typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 16)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
T reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 16];
|
||||
sval2[tid] = myVal2 = sval2[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 8];
|
||||
sval2[tid] = myVal2 = sval2[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct PredVal2WarpReductor<16>
|
||||
{
|
||||
template <typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 8)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
T reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 8];
|
||||
sval2[tid] = myVal2 = sval2[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredVal2WarpReductor<8>
|
||||
{
|
||||
template <typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 4)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
T reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool warp> struct PredVal2ReductionDispatcher;
|
||||
template <> struct PredVal2ReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
|
||||
}
|
||||
};
|
||||
template <> struct PredVal2ReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename V1, typename V2, typename Pred>
|
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal1 = sval1[tid];
|
||||
myVal2 = sval2[tid];
|
||||
|
||||
if (n >= 512 && tid < 256)
|
||||
{
|
||||
T reg = sdata[tid + 256];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 256];
|
||||
sval2[tid] = myVal2 = sval2[tid + 256];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 256 && tid < 128)
|
||||
{
|
||||
T reg = sdata[tid + 128];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 128];
|
||||
sval2[tid] = myVal2 = sval2[tid + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 128 && tid < 64)
|
||||
{
|
||||
T reg = sdata[tid + 64];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 64];
|
||||
sval2[tid] = myVal2 = sval2[tid + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (n >= 64)
|
||||
{
|
||||
T reg = sdata[tid + 32];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 32];
|
||||
sval2[tid] = myVal2 = sval2[tid + 32];
|
||||
}
|
||||
}
|
||||
if (n >= 32)
|
||||
{
|
||||
T reg = sdata[tid + 16];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 16];
|
||||
sval2[tid] = myVal2 = sval2[tid + 16];
|
||||
}
|
||||
}
|
||||
if (n >= 16)
|
||||
{
|
||||
T reg = sdata[tid + 8];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 8];
|
||||
sval2[tid] = myVal2 = sval2[tid + 8];
|
||||
}
|
||||
}
|
||||
if (n >= 8)
|
||||
{
|
||||
T reg = sdata[tid + 4];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 4];
|
||||
sval2[tid] = myVal2 = sval2[tid + 4];
|
||||
}
|
||||
}
|
||||
if (n >= 4)
|
||||
{
|
||||
T reg = sdata[tid + 2];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 2];
|
||||
sval2[tid] = myVal2 = sval2[tid + 2];
|
||||
}
|
||||
}
|
||||
if (n >= 2)
|
||||
{
|
||||
T reg = sdata[tid + 1];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval1[tid] = myVal1 = sval1[tid + 1];
|
||||
sval2[tid] = myVal2 = sval2[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace utility_detail
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__
|
@ -44,7 +44,6 @@
|
||||
#define OPENCV_GPU_EMULATION_HPP_
|
||||
|
||||
#include "warp_reduce.hpp"
|
||||
#include <stdio.h>
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
|
@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
|
||||
template <> struct name<type> : binary_function<type, type, type> \
|
||||
{ \
|
||||
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
|
||||
__device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\
|
||||
__device__ __forceinline__ name():binary_function<type, type, type>(){}\
|
||||
__device__ __forceinline__ name() {}\
|
||||
__device__ __forceinline__ name(const name&) {}\
|
||||
};
|
||||
|
||||
template <typename T> struct maximum : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
{
|
||||
return lhs < rhs ? rhs : lhs;
|
||||
return max(lhs, rhs);
|
||||
}
|
||||
__device__ __forceinline__ maximum(const maximum& other):binary_function<T, T, T>(){}
|
||||
__device__ __forceinline__ maximum():binary_function<T, T, T>(){}
|
||||
__device__ __forceinline__ maximum() {}
|
||||
__device__ __forceinline__ maximum(const maximum&) {}
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
|
||||
@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
{
|
||||
return lhs < rhs ? lhs : rhs;
|
||||
return min(lhs, rhs);
|
||||
}
|
||||
__device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){}
|
||||
__device__ __forceinline__ minimum():binary_function<T, T, T>(){}
|
||||
__device__ __forceinline__ minimum() {}
|
||||
__device__ __forceinline__ minimum(const minimum&) {}
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
|
||||
@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
// Math functions
|
||||
///bound=========================================
|
||||
|
||||
template <typename T> struct abs_func : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
|
||||
{
|
||||
return abs(x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
|
||||
{
|
||||
__device__ __forceinline__ unsigned char operator ()(unsigned char x) const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
|
||||
{
|
||||
__device__ __forceinline__ signed char operator ()(signed char x) const
|
||||
{
|
||||
return ::abs((int)x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<char> : unary_function<char, char>
|
||||
{
|
||||
__device__ __forceinline__ char operator ()(char x) const
|
||||
{
|
||||
return ::abs((int)x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
|
||||
{
|
||||
__device__ __forceinline__ unsigned short operator ()(unsigned short x) const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<short> : unary_function<short, short>
|
||||
{
|
||||
__device__ __forceinline__ short operator ()(short x) const
|
||||
{
|
||||
return ::abs((int)x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
|
||||
{
|
||||
__device__ __forceinline__ unsigned int operator ()(unsigned int x) const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<int> : unary_function<int, int>
|
||||
{
|
||||
__device__ __forceinline__ int operator ()(int x) const
|
||||
{
|
||||
return ::abs(x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<float> : unary_function<float, float>
|
||||
{
|
||||
__device__ __forceinline__ float operator ()(float x) const
|
||||
{
|
||||
return ::fabsf(x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
|
||||
template <> struct abs_func<double> : unary_function<double, double>
|
||||
{
|
||||
__device__ __forceinline__ double operator ()(double x) const
|
||||
{
|
||||
return ::fabs(x);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ abs_func() {}
|
||||
__device__ __forceinline__ abs_func(const abs_func&) {}
|
||||
};
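A quick device-side illustration of the specializations above (hypothetical snippet): unsigned types pass through unchanged, small signed integers go through ::abs on int, and floating-point types use ::fabsf / ::fabs:

    // Illustrative only; absExamples is a hypothetical helper, not part of this header.
    __device__ __forceinline__ void absExamples()
    {
        unsigned char u = abs_func<unsigned char>()(200);   // 200: identity for unsigned types
        short         s = abs_func<short>()((short)-7);     // 7:   via ::abs((int)x)
        float         f = abs_func<float>()(-1.5f);         // 1.5f: via ::fabsf
        (void)u; (void)s; (void)f;                           // silence unused-variable warnings
    }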
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
|
||||
template <typename T> struct name ## _func : unary_function<T, float> \
|
||||
{ \
|
||||
@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
|
||||
{ \
|
||||
return func ## f(v); \
|
||||
} \
|
||||
__device__ __forceinline__ name ## _func() {} \
|
||||
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
|
||||
}; \
|
||||
template <> struct name ## _func<double> : unary_function<double, double> \
|
||||
{ \
|
||||
@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
|
||||
{ \
|
||||
return func(v); \
|
||||
} \
|
||||
__device__ __forceinline__ name ## _func() {} \
|
||||
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
|
||||
};
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
|
||||
@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
|
||||
|
197
modules/gpu/include/opencv2/gpu/device/reduce.hpp
Normal file
@ -0,0 +1,197 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCE_HPP__
|
||||
#define __OPENCV_GPU_REDUCE_HPP__
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "detail/reduce.hpp"
|
||||
#include "detail/reduce_key_val.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <int N, typename T, class Op>
|
||||
__device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
|
||||
{
|
||||
reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
|
||||
}
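A minimal usage sketch for the overload above (hypothetical kernel and functor names; a 256-thread block summing one int per thread):

    // Illustrative only: SumOp and blockSum are hypothetical names, not part of this header.
    struct SumOp
    {
        __device__ __forceinline__ int operator ()(int a, int b) const { return a + b; }
    };

    __global__ void blockSum(const int* src, int* dst)
    {
        __shared__ int smem[256];                      // one slot per thread

        const unsigned int tid = threadIdx.x;
        int val = src[blockIdx.x * 256 + tid];         // each thread loads one element

        cv::gpu::device::reduce<256>(smem, val, tid, SumOp());

        if (tid == 0)
            dst[blockIdx.x] = val;                     // thread 0 holds the block total
    }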
|
||||
template <int N,
|
||||
typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
reduce_detail::Dispatcher<N>::reductor::template reduce<
|
||||
const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
|
||||
}
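The tuple overload above reduces several values in one pass; a minimal sketch (hypothetical kernel and functor names), pairing it with the smem_tuple() helper defined later in this header:

    // Illustrative only: FAdd and blockSumAndSqr are hypothetical names.
    struct FAdd
    {
        __device__ __forceinline__ float operator ()(float a, float b) const { return a + b; }
    };

    __global__ void blockSumAndSqr(const float* src, float* dstSum, float* dstSqr)
    {
        __shared__ float ssum[256];
        __shared__ float ssqr[256];

        const unsigned int tid = threadIdx.x;
        const float v = src[blockIdx.x * 256 + tid];
        float sum = v;
        float sqr = v * v;

        // both reductions share one pass over the block
        cv::gpu::device::reduce<256>(cv::gpu::device::smem_tuple(ssum, ssqr),
                                     thrust::tie(sum, sqr),
                                     tid,
                                     thrust::make_tuple(FAdd(), FAdd()));

        if (tid == 0)
        {
            dstSum[blockIdx.x] = sum;
            dstSqr[blockIdx.x] = sqr;
        }
    }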
|
||||
|
||||
template <unsigned int N, typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
|
||||
}
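A minimal sketch of the overload above (hypothetical kernel and functor names): a block-wide argmin that keeps the smallest key and carries its index along as the value:

    // Illustrative only: LessOp and blockArgMin are hypothetical names.
    struct LessOp
    {
        __device__ __forceinline__ bool operator ()(float a, float b) const { return a < b; }
    };

    __global__ void blockArgMin(const float* src, float* dstVal, int* dstIdx)
    {
        __shared__ float skeys[256];
        __shared__ int   sidx [256];

        const unsigned int tid = threadIdx.x;
        float key = src[blockIdx.x * 256 + tid];
        int   idx = tid;

        cv::gpu::device::reduceKeyVal<256>(skeys, key, sidx, idx, tid, LessOp());

        if (tid == 0)
        {
            dstVal[blockIdx.x] = key;    // smallest value in the block
            dstIdx[blockIdx.x] = idx;    // its position within the block
        }
    }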
|
||||
template <unsigned int N,
|
||||
typename K,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid, const Cmp& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
|
||||
const Cmp&>(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
template <unsigned int N,
|
||||
typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
|
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
|
||||
{
|
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
|
||||
const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
|
||||
>(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
|
||||
// smem_tuple
|
||||
|
||||
template <typename T0>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*>
|
||||
smem_tuple(T0* t0)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*>
|
||||
smem_tuple(T0* t0, T1* t1)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
|
||||
}
|
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
|
||||
__device__ __forceinline__
|
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
|
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
|
||||
{
|
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_REDUCE_HPP__
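A minimal usage sketch (not part of the header): how a kernel might combine reduce() with smem_tuple() to fold two per-thread values in one call. The block size, kernel name and include paths are assumptions for illustration only; plus<> is taken from the device functional header.

#include <thrust/tuple.h>
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"   // assumed source of plus<>

namespace device = cv::gpu::device;

// Hypothetical kernel, launched with 256 threads per block: computes the
// per-block sums of two float arrays in a single reduce() call.
__global__ void blockSums(const float* a, const float* b, float* sumA, float* sumB, int n)
{
    __shared__ float sa[256];
    __shared__ float sb[256];

    const unsigned int tid = threadIdx.x;
    const unsigned int idx = blockIdx.x * blockDim.x + tid;

    float va = idx < n ? a[idx] : 0.0f;
    float vb = idx < n ? b[idx] : 0.0f;

    // smem_tuple packs the shared buffers, thrust::tie packs the registers,
    // and the last tuple supplies one functor per reduced value.
    device::reduce<256>(device::smem_tuple(sa, sb),
                        thrust::tie(va, vb),
                        tid,
                        thrust::make_tuple(device::plus<float>(), device::plus<float>()));

    if (tid == 0)
    {
        sumA[blockIdx.x] = va;
        sumB[blockIdx.x] = vb;
    }
}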
|
@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
|
||||
{
|
||||
return (uchar) ::max((int)v, 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
|
||||
{
|
||||
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
|
||||
{
|
||||
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
|
||||
{
|
||||
return (uchar) ::min(v, (uint)UCHAR_MAX);
|
||||
uint res = 0;
|
||||
int vi = v;
|
||||
asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
|
||||
{
|
||||
return saturate_cast<uchar>((uint)v);
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<uchar>(iv);
|
||||
uint res = 0;
|
||||
asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<uchar>(iv);
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
uint res = 0;
|
||||
asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
|
||||
return res;
|
||||
#else
|
||||
return saturate_cast<uchar>((float)v);
|
||||
#endif
|
||||
@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
|
||||
{
|
||||
return (schar) ::min((int)v, SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
|
||||
{
|
||||
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
|
||||
{
|
||||
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
|
||||
uint res = 0;
|
||||
uint vi = v;
|
||||
asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
|
||||
{
|
||||
return saturate_cast<schar>((int)v);
|
||||
uint res = 0;
|
||||
asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
|
||||
{
|
||||
return (schar) ::min(v, (uint)SCHAR_MAX);
|
||||
uint res = 0;
|
||||
asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<schar>(iv);
|
||||
uint res = 0;
|
||||
asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<schar>(iv);
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
uint res = 0;
|
||||
asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
|
||||
return res;
|
||||
#else
|
||||
return saturate_cast<schar>((float)v);
|
||||
#endif
|
||||
@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
|
||||
{
|
||||
return (ushort) ::max((int)v, 0);
|
||||
ushort res = 0;
|
||||
int vi = v;
|
||||
asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
|
||||
{
|
||||
return (ushort) ::max((int)v, 0);
|
||||
ushort res = 0;
|
||||
asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
|
||||
{
|
||||
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
|
||||
ushort res = 0;
|
||||
asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
|
||||
{
|
||||
return (ushort) ::min(v, (uint)USHRT_MAX);
|
||||
ushort res = 0;
|
||||
asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<ushort>(iv);
|
||||
ushort res = 0;
|
||||
asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<ushort>(iv);
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
ushort res = 0;
|
||||
asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
|
||||
return res;
|
||||
#else
|
||||
return saturate_cast<ushort>((float)v);
|
||||
#endif
|
||||
@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
|
||||
{
|
||||
return (short) ::min((int)v, SHRT_MAX);
|
||||
short res = 0;
|
||||
asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
|
||||
{
|
||||
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
|
||||
short res = 0;
|
||||
asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
|
||||
{
|
||||
return (short) ::min(v, (uint)SHRT_MAX);
|
||||
short res = 0;
|
||||
asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<short>(iv);
|
||||
short res = 0;
|
||||
asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<short>(iv);
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
short res = 0;
|
||||
asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
|
||||
return res;
|
||||
#else
|
||||
return saturate_cast<short>((float)v);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
|
||||
{
|
||||
int res = 0;
|
||||
asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
|
||||
{
|
||||
return __float2int_rn(v);
|
||||
@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
|
||||
{
|
||||
uint res = 0;
|
||||
int vi = v;
|
||||
asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
|
||||
{
|
||||
uint res = 0;
|
||||
asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
|
||||
return res;
|
||||
}
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
|
||||
{
|
||||
return __float2uint_rn(v);
|
||||
|
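A short, hypothetical kernel showing where the cvt.rni.sat path above is exercised: converting a float buffer to 8-bit with saturation and round-to-nearest. The kernel name and include path are illustrative only.

#include "opencv2/gpu/device/saturate_cast.hpp"

// Assumed example kernel: clamp-and-round float samples into uchar.
__global__ void floatToU8(const float* src, uchar* dst, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        dst[idx] = cv::gpu::device::saturate_cast<uchar>(src[idx]);  // maps to cvt.rni.sat.u8.f32
}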
@ -45,7 +45,6 @@
|
||||
|
||||
#include "saturate_cast.hpp"
|
||||
#include "datamov_utils.hpp"
|
||||
#include "detail/reduction_detail.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reduction
|
||||
|
||||
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
|
||||
}
|
||||
|
||||
template <int n, typename T, typename V, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
|
||||
}
|
||||
|
||||
template <int n, typename T, typename V1, typename V2, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Solve linear system
|
||||
|
||||
|
@ -43,7 +43,7 @@
|
||||
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
|
||||
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
|
||||
|
||||
#include "utility.hpp"
|
||||
#include "reduce.hpp"
|
||||
#include "functional.hpp"
|
||||
#include "detail/vec_distance_detail.hpp"
|
||||
|
||||
@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
|
@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
|
||||
|
145
modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
Normal file
@ -0,0 +1,145 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
|
||||
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return __shfl(val, srcLane, width);
|
||||
#else
|
||||
return T();
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return (unsigned int) __shfl((int) val, srcLane, width);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
int lo = __double2loint(val);
|
||||
int hi = __double2hiint(val);
|
||||
|
||||
lo = __shfl(lo, srcLane, width);
|
||||
hi = __shfl(hi, srcLane, width);
|
||||
|
||||
return __hiloint2double(hi, lo);
|
||||
#else
|
||||
return 0.0;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return __shfl_down(val, delta, width);
|
||||
#else
|
||||
return T();
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return (unsigned int) __shfl_down((int) val, delta, width);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
int lo = __double2loint(val);
|
||||
int hi = __double2hiint(val);
|
||||
|
||||
lo = __shfl_down(lo, delta, width);
|
||||
hi = __shfl_down(hi, delta, width);
|
||||
|
||||
return __hiloint2double(hi, lo);
|
||||
#else
|
||||
return 0.0;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return __shfl_up(val, delta, width);
|
||||
#else
|
||||
return T();
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return (unsigned int) __shfl_up((int) val, delta, width);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
int lo = __double2loint(val);
|
||||
int hi = __double2hiint(val);
|
||||
|
||||
lo = __shfl_up(lo, delta, width);
|
||||
hi = __shfl_up(hi, delta, width);
|
||||
|
||||
return __hiloint2double(hi, lo);
|
||||
#else
|
||||
return 0.0;
|
||||
#endif
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
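A hedged sketch of how these wrappers are meant to be combined: a warp-wide sum built from shfl_down. The helper name is invented; the loop is only meaningful on devices of compute capability 3.0 and above, since on older devices the stubs above return 0 / T().

#include "opencv2/gpu/device/warp_shuffle.hpp"

// Hypothetical device helper: sums a value across the 32 lanes of a warp.
__device__ __forceinline__ float warpSum(float val)
{
    for (int delta = 16; delta > 0; delta /= 2)
        val += cv::gpu::device::shfl_down(val, delta);
    return val;   // lane 0 ends up holding the full warp sum
}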
|
26
modules/gpu/misc/carma.toolchain.cmake
Normal file
@ -0,0 +1,26 @@
|
||||
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)
set(CMAKE_SYSTEM_PROCESSOR arm)

set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc-4.5)
set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++-4.5)

# suppress compiler warning
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-psabi" )

# can be located elsewhere
set(__arm_linux_eabi_root /usr/arm-linux-gnueabi)

set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${__arm_linux_eabi_root})

if(EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
  set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${CUDA_TOOLKIT_ROOT_DIR})
endif()

set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)

set(CARMA 1)
add_definitions(-DCARMA)
|
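A cross-compile is then configured by pointing CMake at this toolchain file; the build-directory layout and the CUDA toolkit path below are placeholders, not values taken from the repository.

# run from an empty build directory on the host (paths are examples only)
cmake -DCMAKE_TOOLCHAIN_FILE=../opencv/modules/gpu/misc/carma.toolchain.cmake \
      -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
      -DWITH_CUDA=ON \
      ../opencv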
File diff suppressed because it is too large
@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
cv::gpu::GpuMat d_src(src);
|
||||
cv::gpu::GpuMat d_hist;
|
||||
cv::gpu::GpuMat d_buf;
|
||||
|
||||
cv::gpu::calcHist(d_src, d_hist, d_buf);
|
||||
cv::gpu::calcHist(d_src, d_hist);
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
cv::gpu::calcHist(d_src, d_hist, d_buf);
|
||||
cv::gpu::calcHist(d_src, d_hist);
|
||||
}
|
||||
|
||||
GPU_SANITY_CHECK(d_hist);
|
||||
@ -1512,13 +1511,13 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColor, Combine(
|
||||
CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
|
||||
CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
|
||||
CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
|
||||
CvtColorInfo(3, 3, cv::COLOR_RGB2Lab),
|
||||
CvtColorInfo(3, 3, cv::COLOR_LBGR2Lab),
|
||||
CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
|
||||
CvtColorInfo(3, 3, cv::COLOR_RGB2Luv),
|
||||
CvtColorInfo(3, 3, cv::COLOR_LBGR2Luv),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Lab2RGB),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Luv2BGR),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Lab2LBGR),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
|
||||
CvtColorInfo(3, 3, cv::COLOR_Luv2LRGB),
|
||||
CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
|
||||
CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
|
||||
CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
|
||||
@ -1706,10 +1705,30 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidGetLayer, Combine(GPU_TYPICAL_MAT_S
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct Vec3fComparator
|
||||
{
|
||||
bool operator()(const cv::Vec3f& a, const cv::Vec3f b) const
|
||||
{
|
||||
if(a[0] != b[0]) return a[0] < b[0];
|
||||
else if(a[1] != b[1]) return a[1] < b[1];
|
||||
else return a[2] < b[2];
|
||||
}
|
||||
};
|
||||
struct Vec2fComparator
|
||||
{
|
||||
bool operator()(const cv::Vec2f& a, const cv::Vec2f b) const
|
||||
{
|
||||
if(a[0] != b[0]) return a[0] < b[0];
|
||||
else return a[1] < b[1];
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// HoughLines
|
||||
|
||||
PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
|
||||
PERF_TEST_P(Sz, ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
declare.time(30.0);
|
||||
|
||||
@ -1744,7 +1763,11 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
|
||||
cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold);
|
||||
}
|
||||
|
||||
GPU_SANITY_CHECK(d_lines);
|
||||
cv::Mat h_lines(d_lines);
|
||||
cv::Vec2f* begin = (cv::Vec2f*)(h_lines.ptr<char>(0));
|
||||
cv::Vec2f* end = (cv::Vec2f*)(h_lines.ptr<char>(0) + (h_lines.cols) * 2 * sizeof(float));
|
||||
std::sort(begin, end, Vec2fComparator());
|
||||
SANITY_CHECK(h_lines);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1756,7 +1779,8 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
|
||||
cv::HoughLines(src, lines, rho, theta, threshold);
|
||||
}
|
||||
|
||||
CPU_SANITY_CHECK(lines);
|
||||
std::sort(lines.begin(), lines.end(), Vec2fComparator());
|
||||
SANITY_CHECK(lines);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1804,7 +1828,11 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES,
|
||||
cv::gpu::HoughCircles(d_src, d_circles, d_buf, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
|
||||
}
|
||||
|
||||
GPU_SANITY_CHECK(d_circles);
|
||||
cv::Mat h_circles(d_circles);
|
||||
cv::Vec3f* begin = (cv::Vec3f*)(h_circles.ptr<char>(0));
|
||||
cv::Vec3f* end = (cv::Vec3f*)(h_circles.ptr<char>(0) + (h_circles.cols) * 3 * sizeof(float));
|
||||
std::sort(begin, end, Vec3fComparator());
|
||||
SANITY_CHECK(h_circles);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1817,7 +1845,8 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES,
|
||||
cv::HoughCircles(src, circles, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
|
||||
}
|
||||
|
||||
CPU_SANITY_CHECK(circles);
|
||||
std::sort(circles.begin(), circles.end(), Vec3fComparator());
|
||||
SANITY_CHECK(circles);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
|
||||
SANITY_CHECK(found_locations);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// HaarClassifier
|
||||
|
||||
|
@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
|
||||
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
|
||||
{
|
||||
#ifndef HAVE_CUBLAS
|
||||
(void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream;
|
||||
(void)src1;
|
||||
(void)src2;
|
||||
(void)alpha;
|
||||
(void)src3;
|
||||
(void)beta;
|
||||
(void)dst;
|
||||
(void)flags;
|
||||
(void)stream;
|
||||
CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
|
||||
|
||||
#else
|
||||
|
||||
// CUBLAS works with column-major matrices
|
||||
|
||||
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
|
||||
@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
|
||||
|
||||
if (src1.depth() == CV_64F)
|
||||
{
|
||||
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
|
||||
}
|
||||
|
||||
cublasSafeCall( cublasDestroy_v2(handle) );
|
||||
|
||||
#endif
|
||||
}
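On the caller side the same deviceSupports(NATIVE_DOUBLE) test can guard a double-precision request before it reaches gemm(); a hedged sketch, with the helper name and include path being assumptions:

#include "opencv2/gpu/gpu.hpp"

// Hypothetical helper: D = A*B + C in CV_64FC1, guarded the same way the library guards itself.
void gemm64f(const cv::gpu::GpuMat& A, const cv::gpu::GpuMat& B,
             const cv::gpu::GpuMat& C, cv::gpu::GpuMat& D)
{
    if (!cv::gpu::deviceSupports(cv::gpu::NATIVE_DOUBLE))
        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");

    cv::gpu::gemm(A, B, 1.0, C, 1.0, D);
}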
|
||||
|
||||
@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
|
||||
}
|
||||
else // if (src.elemSize() == 8)
|
||||
{
|
||||
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
|
||||
if (!deviceSupports(NATIVE_DOUBLE))
|
||||
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
|
||||
|
||||
NppStStreamHandler h(stream);
|
||||
|
@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_knnmatch
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_radius_match
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
@ -198,11 +198,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
using namespace cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -234,10 +234,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
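Dropping the compute-capability argument is internal only; the public matcher interface is unchanged. A hedged usage sketch of the single-train-image path, assuming the templated BruteForceMatcher_GPU front end and CV_32F descriptors (function name is illustrative):

#include <vector>
#include "opencv2/gpu/gpu.hpp"

// Hypothetical: match two uploaded descriptor matrices with L2 distance.
void matchDescriptors(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs,
                      std::vector<cv::DMatch>& matches)
{
    cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
    matcher.match(queryDescs, trainDescs, matches);   // matchSingle + matchDownload under the hood
}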
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
|
||||
@ -268,14 +265,14 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
|
||||
const float* distance_ptr = distance.ptr<float>();
|
||||
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
int train_idx = *trainIdx_ptr;
|
||||
|
||||
if (_trainIdx == -1)
|
||||
if (train_idx == -1)
|
||||
continue;
|
||||
|
||||
float _distance = *distance_ptr;
|
||||
float distance_local = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, _trainIdx, 0, _distance);
|
||||
DMatch m(queryIdx, train_idx, 0, distance_local);
|
||||
|
||||
matches.push_back(m);
|
||||
}
|
||||
@ -340,11 +337,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
using namespace cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -376,10 +373,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
|
||||
@ -413,16 +407,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
|
||||
const float* distance_ptr = distance.ptr<float>();
|
||||
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int trainIdx = *trainIdx_ptr;
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
|
||||
if (trainIdx == -1)
|
||||
if (_trainIdx == -1)
|
||||
continue;
|
||||
|
||||
int imgIdx = *imgIdx_ptr;
|
||||
int _imgIdx = *imgIdx_ptr;
|
||||
|
||||
float distance = *distance_ptr;
|
||||
float _distance = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, trainIdx, imgIdx, distance);
|
||||
DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
|
||||
|
||||
matches.push_back(m);
|
||||
}
|
||||
@ -451,11 +445,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_knnmatch;
|
||||
using namespace cv::gpu::device::bf_knnmatch;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -502,10 +496,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
|
||||
@ -548,13 +539,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c
|
||||
|
||||
for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int trainIdx = *trainIdx_ptr;
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
|
||||
if (trainIdx != -1)
|
||||
if (_trainIdx != -1)
|
||||
{
|
||||
float distance = *distance_ptr;
|
||||
float _distance = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, trainIdx, 0, distance);
|
||||
DMatch m(queryIdx, _trainIdx, 0, _distance);
|
||||
|
||||
curMatches.push_back(m);
|
||||
}
|
||||
@ -580,11 +571,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_knnmatch;
|
||||
using namespace cv::gpu::device::bf_knnmatch;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -621,10 +612,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
|
||||
@ -667,15 +655,15 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx,
|
||||
|
||||
for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int trainIdx = *trainIdx_ptr;
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
|
||||
if (trainIdx != -1)
|
||||
if (_trainIdx != -1)
|
||||
{
|
||||
int imgIdx = *imgIdx_ptr;
|
||||
int _imgIdx = *imgIdx_ptr;
|
||||
|
||||
float distance = *distance_ptr;
|
||||
float _distance = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, trainIdx, imgIdx, distance);
|
||||
DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
|
||||
|
||||
curMatches.push_back(m);
|
||||
}
|
||||
@ -765,7 +753,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -786,12 +774,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
}
|
||||
};
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
|
||||
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
|
||||
|
||||
const int nQuery = query.rows;
|
||||
const int nTrain = train.rows;
|
||||
|
||||
@ -814,7 +796,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
|
||||
@ -852,25 +834,25 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
|
||||
const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
|
||||
const float* distance_ptr = distance.ptr<float>(queryIdx);
|
||||
|
||||
const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
|
||||
const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
|
||||
|
||||
if (nMatches == 0)
|
||||
if (nMatched == 0)
|
||||
{
|
||||
if (!compactResult)
|
||||
matches.push_back(vector<DMatch>());
|
||||
continue;
|
||||
}
|
||||
|
||||
matches.push_back(vector<DMatch>(nMatches));
|
||||
matches.push_back(vector<DMatch>(nMatched));
|
||||
vector<DMatch>& curMatches = matches.back();
|
||||
|
||||
for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
|
||||
for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int trainIdx = *trainIdx_ptr;
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
|
||||
float distance = *distance_ptr;
|
||||
float _distance = *distance_ptr;
|
||||
|
||||
DMatch m(queryIdx, trainIdx, 0, distance);
|
||||
DMatch m(queryIdx, _trainIdx, 0, _distance);
|
||||
|
||||
curMatches[i] = m;
|
||||
}
|
||||
@ -897,7 +879,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -918,12 +900,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
}
|
||||
};
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
|
||||
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
|
||||
|
||||
const int nQuery = query.rows;
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
@ -949,7 +925,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
vector<PtrStepSzb> masks_(masks.begin(), masks.end());
|
||||
|
||||
func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
|
||||
trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
|
||||
trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
|
||||
@ -990,9 +966,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
|
||||
const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
|
||||
const float* distance_ptr = distance.ptr<float>(queryIdx);
|
||||
|
||||
const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
|
||||
const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
|
||||
|
||||
if (nMatches == 0)
|
||||
if (nMatched == 0)
|
||||
{
|
||||
if (!compactResult)
|
||||
matches.push_back(vector<DMatch>());
|
||||
@ -1001,9 +977,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
|
||||
|
||||
matches.push_back(vector<DMatch>());
|
||||
vector<DMatch>& curMatches = matches.back();
|
||||
curMatches.reserve(nMatches);
|
||||
curMatches.reserve(nMatched);
|
||||
|
||||
for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
|
||||
for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
|
||||
{
|
||||
int _trainIdx = *trainIdx_ptr;
|
||||
int _imgIdx = *imgIdx_ptr;
|
||||
|
@ -622,7 +622,7 @@ private:
|
||||
}
|
||||
|
||||
// copy data structures on gpu
|
||||
stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) ));
|
||||
stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
|
||||
trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
|
||||
nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
|
||||
leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));
|
||||
|
@ -53,7 +53,7 @@ void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nog
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
#include <cvt_colot_internal.h>
|
||||
#include "cvt_color_internal.h"
|
||||
|
||||
namespace cv { namespace gpu {
|
||||
namespace device
|
||||
@ -69,7 +69,7 @@ using namespace ::cv::gpu::device;
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef void (*gpu_func_t)(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
|
||||
typedef void (*gpu_func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
|
||||
|
||||
void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
|
||||
{
|
||||
@ -1155,154 +1155,420 @@ namespace
|
||||
funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
|
||||
void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
|
||||
{
|
||||
#if (CUDA_VERSION < 5000)
|
||||
(void)src;
|
||||
(void)dst;
|
||||
(void)dcn;
|
||||
(void)st;
|
||||
CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
|
||||
#else
|
||||
CV_Assert(src.depth() == CV_8U);
|
||||
CV_Assert(src.channels() == 3);
|
||||
using namespace cv::gpu::device;
|
||||
static const gpu_func_t funcs[2][2][2] =
|
||||
{
|
||||
{
|
||||
{bgr_to_lab_8u, bgr_to_lab_32f},
|
||||
{bgra_to_lab_8u, bgra_to_lab_32f}
|
||||
},
|
||||
{
|
||||
{bgr_to_lab4_8u, bgr_to_lab4_32f},
|
||||
{bgra_to_lab4_8u, bgra_to_lab4_32f}
|
||||
}
|
||||
};
|
||||
|
||||
dcn = src.channels();
|
||||
if (dcn <= 0) dcn = 3;
|
||||
|
||||
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
|
||||
CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
|
||||
CV_Assert(src.channels() == 3 || src.channels() == 4);
|
||||
CV_Assert(dcn == 3 || dcn == 4);
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(st);
|
||||
NppStreamHandler h(stream);
|
||||
dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
|
||||
|
||||
NppiSize oSizeROI;
|
||||
oSizeROI.width = src.cols;
|
||||
oSizeROI.height = src.rows;
|
||||
|
||||
nppSafeCall( nppiBGRToLab_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
#endif
|
||||
funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void rgb_to_lab(const GpuMat& src, GpuMat& dst, int, Stream& stream)
|
||||
void rgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
|
||||
{
|
||||
bgr_to_rgb(src, dst, -1, stream);
|
||||
bgr_to_lab(dst, dst, -1, stream);
|
||||
using namespace cv::gpu::device;
|
||||
static const gpu_func_t funcs[2][2][2] =
|
||||
{
|
||||
{
|
||||
{rgb_to_lab_8u, rgb_to_lab_32f},
|
||||
{rgba_to_lab_8u, rgba_to_lab_32f}
|
||||
},
|
||||
{
|
||||
{rgb_to_lab4_8u, rgb_to_lab4_32f},
|
||||
{rgba_to_lab4_8u, rgba_to_lab4_32f}
|
||||
}
|
||||
};
|
||||
|
||||
if (dcn <= 0) dcn = 3;
|
||||
|
||||
CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
|
||||
CV_Assert(src.channels() == 3 || src.channels() == 4);
|
||||
CV_Assert(dcn == 3 || dcn == 4);
|
||||
|
||||
dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));
|
||||
|
||||
funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    void lbgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
#if (CUDA_VERSION < 5000)
        (void)src;
        (void)dst;
        (void)dcn;
        (void)st;
        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
#else
        CV_Assert(src.depth() == CV_8U);
        CV_Assert(src.channels() == 3);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lbgr_to_lab_8u, lbgr_to_lab_32f},
                {lbgra_to_lab_8u, lbgra_to_lab_32f}
            },
            {
                {lbgr_to_lab4_8u, lbgr_to_lab4_32f},
                {lbgra_to_lab4_8u, lbgra_to_lab4_32f}
            }
        };

        dcn = src.channels();
        if (dcn <= 0) dcn = 3;

        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        cudaStream_t stream = StreamAccessor::getStream(st);
        NppStreamHandler h(stream);
        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        NppiSize oSizeROI;
        oSizeROI.width = src.cols;
        oSizeROI.height = src.rows;

        nppSafeCall( nppiLabToBGR_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
#endif
        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void lab_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    void lrgb_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        lab_to_bgr(src, dst, -1, stream);
        bgr_to_rgb(dst, dst, -1, stream);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lrgb_to_lab_8u, lrgb_to_lab_32f},
                {lrgba_to_lab_8u, lrgba_to_lab_32f}
            },
            {
                {lrgb_to_lab4_8u, lrgb_to_lab4_32f},
                {lrgba_to_lab4_8u, lrgba_to_lab4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
#if (CUDA_VERSION < 5000)
        (void)src;
        (void)dst;
        (void)dcn;
        (void)st;
        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
#else
        CV_Assert(src.depth() == CV_8U);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lab_to_bgr_8u, lab_to_bgr_32f},
                {lab4_to_bgr_8u, lab4_to_bgr_32f}
            },
            {
                {lab_to_bgra_8u, lab_to_bgra_32f},
                {lab4_to_bgra_8u, lab4_to_bgra_32f}
            }
        };

        dcn = src.channels();
        if (dcn <= 0) dcn = 3;

        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        cudaStream_t stream = StreamAccessor::getStream(st);
        NppStreamHandler h(stream);
        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        NppiSize oSizeROI;
        oSizeROI.width = src.cols;
        oSizeROI.height = src.rows;

        if (dcn == 3)
            nppSafeCall( nppiRGBToLUV_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
        else
            nppSafeCall( nppiRGBToLUV_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
#endif
        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void bgr_to_luv(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    void lab_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        bgr_to_rgb(src, dst, -1, stream);
        rgb_to_luv(dst, dst, -1, stream);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lab_to_rgb_8u, lab_to_rgb_32f},
                {lab4_to_rgb_8u, lab4_to_rgb_32f}
            },
            {
                {lab_to_rgba_8u, lab_to_rgba_32f},
                {lab4_to_rgba_8u, lab4_to_rgba_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    void lab_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
#if (CUDA_VERSION < 5000)
        (void)src;
        (void)dst;
        (void)dcn;
        (void)st;
        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
#else
        CV_Assert(src.depth() == CV_8U);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lab_to_lbgr_8u, lab_to_lbgr_32f},
                {lab4_to_lbgr_8u, lab4_to_lbgr_32f}
            },
            {
                {lab_to_lbgra_8u, lab_to_lbgra_32f},
                {lab4_to_lbgra_8u, lab4_to_lbgra_32f}
            }
        };

        dcn = src.channels();
        if (dcn <= 0) dcn = 3;

        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        cudaStream_t stream = StreamAccessor::getStream(st);
        NppStreamHandler h(stream);
        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        NppiSize oSizeROI;
        oSizeROI.width = src.cols;
        oSizeROI.height = src.rows;

        if (dcn == 3)
            nppSafeCall( nppiLUVToRGB_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
        else
            nppSafeCall( nppiLUVToRGB_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
#endif
        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    void lab_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        luv_to_rgb(src, dst, -1, stream);
        bgr_to_rgb(dst, dst, -1, stream);
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lab_to_lrgb_8u, lab_to_lrgb_32f},
                {lab4_to_lrgb_8u, lab4_to_lrgb_32f}
            },
            {
                {lab_to_lrgba_8u, lab_to_lrgba_32f},
                {lab4_to_lrgba_8u, lab4_to_lrgba_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void bgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {bgr_to_luv_8u, bgr_to_luv_32f},
                {bgra_to_luv_8u, bgra_to_luv_32f}
            },
            {
                {bgr_to_luv4_8u, bgr_to_luv4_32f},
                {bgra_to_luv4_8u, bgra_to_luv4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {rgb_to_luv_8u, rgb_to_luv_32f},
                {rgba_to_luv_8u, rgba_to_luv_32f}
            },
            {
                {rgb_to_luv4_8u, rgb_to_luv4_32f},
                {rgba_to_luv4_8u, rgba_to_luv4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void lbgr_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lbgr_to_luv_8u, lbgr_to_luv_32f},
                {lbgra_to_luv_8u, lbgra_to_luv_32f}
            },
            {
                {lbgr_to_luv4_8u, lbgr_to_luv4_32f},
                {lbgra_to_luv4_8u, lbgra_to_luv4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void lrgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {lrgb_to_luv_8u, lrgb_to_luv_32f},
                {lrgba_to_luv_8u, lrgba_to_luv_32f}
            },
            {
                {lrgb_to_luv4_8u, lrgb_to_luv4_32f},
                {lrgba_to_luv4_8u, lrgba_to_luv4_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {luv_to_bgr_8u, luv_to_bgr_32f},
                {luv4_to_bgr_8u, luv4_to_bgr_32f}
            },
            {
                {luv_to_bgra_8u, luv_to_bgra_32f},
                {luv4_to_bgra_8u, luv4_to_bgra_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {luv_to_rgb_8u, luv_to_rgb_32f},
                {luv4_to_rgb_8u, luv4_to_rgb_32f}
            },
            {
                {luv_to_rgba_8u, luv_to_rgba_32f},
                {luv4_to_rgba_8u, luv4_to_rgba_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_lbgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {luv_to_lbgr_8u, luv_to_lbgr_32f},
                {luv4_to_lbgr_8u, luv4_to_lbgr_32f}
            },
            {
                {luv_to_lbgra_8u, luv_to_lbgra_32f},
                {luv4_to_lbgra_8u, luv4_to_lbgra_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void luv_to_lrgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        using namespace cv::gpu::device;
        static const gpu_func_t funcs[2][2][2] =
        {
            {
                {luv_to_lrgb_8u, luv_to_lrgb_32f},
                {luv4_to_lrgb_8u, luv4_to_lrgb_32f}
            },
            {
                {luv_to_lrgba_8u, luv_to_lrgba_32f},
                {luv4_to_lrgba_8u, luv4_to_lrgba_32f}
            }
        };

        if (dcn <= 0) dcn = 3;

        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);

        dst.create(src.size(), CV_MAKE_TYPE(src.depth(), dcn));

        funcs[dcn == 4][src.channels() == 4][src.depth() == CV_32F](src, dst, StreamAccessor::getStream(stream));
    }

    void rgba_to_mbgra(const GpuMat& src, GpuMat& dst, int, Stream& st)
@ -1475,15 +1741,15 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
        hls_to_bgr_full,        // CV_HLS2BGR_FULL = 72
        hls_to_rgb_full,        // CV_HLS2RGB_FULL = 73

        0,                      // CV_LBGR2Lab     = 74
        0,                      // CV_LRGB2Lab     = 75
        0,                      // CV_LBGR2Luv     = 76
        0,                      // CV_LRGB2Luv     = 77
        lbgr_to_lab,            // CV_LBGR2Lab     = 74
        lrgb_to_lab,            // CV_LRGB2Lab     = 75
        lbgr_to_luv,            // CV_LBGR2Luv     = 76
        lrgb_to_luv,            // CV_LRGB2Luv     = 77

        0,                      // CV_Lab2LBGR     = 78
        0,                      // CV_Lab2LRGB     = 79
        0,                      // CV_Luv2LBGR     = 80
        0,                      // CV_Luv2LRGB     = 81
        lab_to_lbgr,            // CV_Lab2LBGR     = 78
        lab_to_lrgb,            // CV_Lab2LRGB     = 79
        luv_to_lbgr,            // CV_Luv2LBGR     = 80
        luv_to_lrgb,            // CV_Luv2LRGB     = 81

        bgr_to_yuv,             // CV_BGR2YUV      = 82
        rgb_to_yuv,             // CV_RGB2YUV      = 83

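With the dispatch-table entries above filled in, the linearized-RGB Lab/Luv codes are routed through the same cvtColor path as every other conversion. A minimal standalone usage sketch (not part of the patch; it assumes the OpenCV 2.4 gpu module and a hypothetical input file name):

    // build against opencv_core, opencv_imgproc, opencv_highgui, opencv_gpu
    #include <opencv2/highgui/highgui.hpp>   // imread (2.4 layout)
    #include <opencv2/imgproc/imgproc.hpp>   // CV_LBGR2Lab constant
    #include <opencv2/gpu/gpu.hpp>           // cv::gpu::GpuMat, cv::gpu::cvtColor

    int main()
    {
        cv::Mat h_bgr = cv::imread("input.png");   // 8-bit BGR image (hypothetical file)
        cv::gpu::GpuMat d_bgr(h_bgr);              // upload to the device
        cv::gpu::GpuMat d_lab;
        cv::gpu::cvtColor(d_bgr, d_lab, CV_LBGR2Lab);  // code enabled by this patch
        cv::Mat h_lab;
        d_lab.download(h_lab);                     // bring the result back to the host
        return 0;
    }
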
@ -42,10 +42,13 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device
                                   int& bestTrainIdx1, int& bestTrainIdx2,
                                   float* s_distance, int* s_trainIdx)
    {
#if __CUDA_ARCH__ >= 300
        (void) s_distance;
        (void) s_trainIdx;

        float d1, d2;
        int i1, i2;

        #pragma unroll
        for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
        {
            d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
            d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
            i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
            i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);

            if (bestDistance1 < d1)
            {
                if (d1 < bestDistance2)
                {
                    bestDistance2 = d1;
                    bestTrainIdx2 = i1;
                }
            }
            else
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestDistance1 = d1;
                bestTrainIdx1 = i1;

                if (d2 < bestDistance2)
                {
                    bestDistance2 = d2;
                    bestTrainIdx2 = i2;
                }
            }
        }
#else
        float myBestDistance1 = numeric_limits<float>::max();
        float myBestDistance2 = numeric_limits<float>::max();
        int myBestTrainIdx1 = -1;
@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device

        bestTrainIdx1 = myBestTrainIdx1;
        bestTrainIdx2 = myBestTrainIdx2;
#endif
    }

    template <int BLOCK_SIZE>
@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device
                                   int& bestImgIdx1, int& bestImgIdx2,
                                   float* s_distance, int* s_trainIdx, int* s_imgIdx)
    {
#if __CUDA_ARCH__ >= 300
        (void) s_distance;
        (void) s_trainIdx;
        (void) s_imgIdx;

        float d1, d2;
        int i1, i2;
        int j1, j2;

        #pragma unroll
        for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
        {
            d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
            d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
            i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
            i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
            j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
            j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);

            if (bestDistance1 < d1)
            {
                if (d1 < bestDistance2)
                {
                    bestDistance2 = d1;
                    bestTrainIdx2 = i1;
                    bestImgIdx2 = j1;
                }
            }
            else
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;
                bestImgIdx2 = bestImgIdx1;

                bestDistance1 = d1;
                bestTrainIdx1 = i1;
                bestImgIdx1 = j1;

                if (d2 < bestDistance2)
                {
                    bestDistance2 = d2;
                    bestTrainIdx2 = i2;
                    bestImgIdx2 = j2;
                }
            }
        }
#else
        float myBestDistance1 = numeric_limits<float>::max();
        float myBestDistance2 = numeric_limits<float>::max();
        int myBestTrainIdx1 = -1;
@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device

        bestImgIdx1 = myBestImgIdx1;
        bestImgIdx2 = myBestImgIdx2;
#endif
    }

    ///////////////////////////////////////////////////////////////////////////////
@ -748,9 +839,8 @@ namespace cv { namespace gpu { namespace device
    template <typename Dist, typename T, typename Mask>
    void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                          const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
                          int cc, cudaStream_t stream)
                          cudaStream_t stream)
    {
        (void)cc;
        if (query.cols <= 64)
        {
            matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@ -780,9 +870,8 @@ namespace cv { namespace gpu { namespace device
    template <typename Dist, typename T, typename Mask>
    void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                          const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                          int cc, cudaStream_t stream)
                          cudaStream_t stream)
    {
        (void)cc;
        if (query.cols <= 64)
        {
            matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@ -945,9 +1034,8 @@ namespace cv { namespace gpu { namespace device
    template <typename Dist, typename T, typename Mask>
    void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                const PtrStepSzf& allDist,
                                int cc, cudaStream_t stream)
                                cudaStream_t stream)
    {
        (void)cc;
        if (query.cols <= 64)
        {
            calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
@ -1005,7 +1093,7 @@ namespace cv { namespace gpu { namespace device
        s_trainIdx[threadIdx.x] = bestIdx;
        __syncthreads();

        reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
        reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());

        if (threadIdx.x == 0)
        {
@ -1034,7 +1122,7 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
    void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
    {
        findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
    }
@ -1045,16 +1133,16 @@ namespace cv { namespace gpu { namespace device
    template <typename Dist, typename T, typename Mask>
    void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
                         const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                         int cc, cudaStream_t stream)
                         cudaStream_t stream)
    {
        if (k == 2)
        {
            match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
            match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
        }
        else
        {
            calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
            findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
            calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
            findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
        }
    }

@ -1063,103 +1151,103 @@ namespace cv { namespace gpu { namespace device

    template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                           const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                           int cc, cudaStream_t stream)
                                           cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
        else
            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
    }

    template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

    template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                           const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                           int cc, cudaStream_t stream)
                                           cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
        else
            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
    }

    //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

    template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                                const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                                int cc, cudaStream_t stream)
                                                cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
        else
            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
    }

    template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
    template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
    template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

    template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                            int cc, cudaStream_t stream)
                                            cudaStream_t stream)
    {
        if (masks.data)
            match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
        else
            match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
    }

    template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

    template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                            int cc, cudaStream_t stream)
                                            cudaStream_t stream)
    {
        if (masks.data)
            match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
        else
            match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
    }

    //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

    template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                 const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                                 int cc, cudaStream_t stream)
                                                 cudaStream_t stream)
    {
        if (masks.data)
            match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
        else
            match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
            match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
    }

    template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    } // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device {

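The __CUDA_ARCH__ >= 300 paths above replace the shared-memory reductions with warp-shuffle reductions. The following is a minimal standalone sketch of that pattern (not part of the patch; it assumes compute capability >= 3.0 and the pre-CUDA-9 __shfl_down intrinsic, and the kernel name is hypothetical):

    #include <cuda_runtime.h>
    #include <math.h>

    // One warp (32 threads) cooperatively finds the minimum of 32 floats
    // without touching shared memory, the same trick used by findBestMatch.
    __global__ void warpMinSketch(const float* data, float* result)
    {
        float v = data[threadIdx.x];                 // one value per lane
        for (int offset = 16; offset >= 1; offset /= 2)
        {
            float other = __shfl_down(v, offset);    // read v from lane (threadIdx.x + offset)
            v = fminf(v, other);                     // keep the smaller distance
        }
        if (threadIdx.x == 0)
            *result = v;                             // lane 0 ends up with the warp minimum
    }

    // Launched as warpMinSketch<<<1, 32>>>(d_data, d_result); and compiled with -arch=sm_30.
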
@ -42,7 +42,9 @@
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/utility.hpp"
|
||||
#include "opencv2/gpu/device/reduce.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/vec_distance.hpp"
|
||||
#include "opencv2/gpu/device/datamov_utils.hpp"
|
||||
@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device
|
||||
s_distance += threadIdx.y * BLOCK_SIZE;
|
||||
s_trainIdx += threadIdx.y * BLOCK_SIZE;
|
||||
|
||||
s_distance[threadIdx.x] = bestDistance;
|
||||
s_trainIdx[threadIdx.x] = bestTrainIdx;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
|
||||
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
|
||||
}
|
||||
|
||||
template <int BLOCK_SIZE>
|
||||
@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device
|
||||
s_trainIdx += threadIdx.y * BLOCK_SIZE;
|
||||
s_imgIdx += threadIdx.y * BLOCK_SIZE;
|
||||
|
||||
s_distance[threadIdx.x] = bestDistance;
|
||||
s_trainIdx[threadIdx.x] = bestTrainIdx;
|
||||
s_imgIdx [threadIdx.x] = bestImgIdx;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
|
||||
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@ -567,9 +558,8 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
(void)cc;
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
|
||||
@ -599,9 +589,8 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
(void)cc;
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
|
||||
@ -633,151 +622,151 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}

template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
} // namespace bf_match
}}} // namespace cv { namespace gpu { namespace device {

@ -42,7 +42,8 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

extern __shared__ int smem[];

const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}

#endif
}

template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

extern __shared__ int smem[];

const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}

#endif
}

template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
@ -281,9 +274,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
@ -313,9 +305,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
@ -347,124 +338,124 @@ namespace cv { namespace gpu { namespace device

template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
|
||||
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device

@ -42,9 +42,10 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"

namespace cv { namespace gpu { namespace device
{
@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
};

void call(const PtrStepSz<float3> src, const float* rot,
@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
};

void call(const PtrStepSz<float3> src, const float* rot,
@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
return x * x;
}

template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers)
@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
++num_inliers;
}

extern __shared__ float s_num_inliers[];
s_num_inliers[threadIdx.x] = num_inliers;
__syncthreads();

for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
__shared__ int s_num_inliers[BLOCK_SIZE];
reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());

if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0];
g_num_inliers[blockIdx.x] = num_inliers;
}

void computeHypothesisScores(
@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device

dim3 threads(256);
dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);

computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() );

@ -43,459 +43,451 @@
#if !defined CUDA_DISABLER

#include <utility>
#include <algorithm>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace canny
{
struct L1 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::abs(x) + ::abs(y);
}

__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::sqrtf(x * x + y * y);
}

__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
};
}

namespace cv { namespace gpu { namespace device
{
namespace canny
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
enum { smart_shift = 4 };
};
template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
enum { smart_shift = 4 };
};
}}}

namespace canny
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
struct SrcTex
|
||||
{
|
||||
const int xoff;
|
||||
const int yoff;
|
||||
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
|
||||
|
||||
__device__ __forceinline__ int operator ()(int y, int x) const
|
||||
{
|
||||
__shared__ int smem[16][18];
|
||||
return tex2D(tex_src, x + xoff, y + yoff);
|
||||
}
|
||||
};
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
template <class Norm> __global__
|
||||
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
|
||||
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
|
||||
}
|
||||
__syncthreads();
|
||||
if (y >= mag.rows || x >= mag.cols)
|
||||
return;
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
}
|
||||
}
|
||||
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
|
||||
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
|
||||
|
||||
dx(y, x) = dxVal;
|
||||
dy(y, x) = dyVal;
|
||||
|
||||
mag(y, x) = norm(dxVal, dyVal);
|
||||
}
|
||||
|
||||
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
|
||||
{
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
|
||||
|
||||
bindTexture(&tex_src, srcWhole);
|
||||
SrcTex src(xoff, yoff);
|
||||
|
||||
if (L2Grad)
|
||||
{
|
||||
L2 norm;
|
||||
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
|
||||
}
|
||||
else
|
||||
{
|
||||
L1 norm;
|
||||
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
|
||||
}
|
||||
|
||||
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
|
||||
{
|
||||
if (L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
L2 norm;
|
||||
transform(dx, dy, mag, norm, WithOutMask(), 0);
|
||||
}
|
||||
|
||||
struct L1
|
||||
else
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return ::abs(x) + ::abs(y);
|
||||
}
|
||||
};
|
||||
struct L2
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return ::sqrtf(x * x + y * y);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
|
||||
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
__shared__ int sdx[18][16];
|
||||
__shared__ int sdy[18][16];
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
|
||||
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
|
||||
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
|
||||
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
|
||||
|
||||
dx.ptr(i)[j] = x;
|
||||
dy.ptr(i)[j] = y;
|
||||
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
|
||||
}
|
||||
}
|
||||
L1 norm;
|
||||
transform(dx, dy, mag, norm, WithOutMask(), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace canny
|
||||
{
|
||||
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
|
||||
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
|
||||
{
|
||||
const int CANNY_SHIFT = 15;
|
||||
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
|
||||
return;
|
||||
|
||||
int dxVal = dx(y, x);
|
||||
int dyVal = dy(y, x);
|
||||
|
||||
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
|
||||
const float m = tex2D(tex_mag, x, y);
|
||||
|
||||
dxVal = ::abs(dxVal);
|
||||
dyVal = ::abs(dyVal);
|
||||
|
||||
// 0 - the pixel can not belong to an edge
|
||||
// 1 - the pixel might belong to an edge
|
||||
// 2 - the pixel does belong to an edge
|
||||
int edge_type = 0;
|
||||
|
||||
if (m > low_thresh)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
const int tg22x = dxVal * TG22;
|
||||
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
dyVal <<= CANNY_SHIFT;
|
||||
|
||||
if (dyVal < tg22x)
|
||||
{
|
||||
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else if(dyVal > tg67x)
|
||||
{
|
||||
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
{
|
||||
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
map(y, x) = edge_type;
|
||||
}
|
||||
|
||||
if (i < rows && j < cols)
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
|
||||
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
|
||||
{
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
|
||||
|
||||
bindTexture(&tex_mag, mag);
|
||||
|
||||
calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace canny
|
||||
{
|
||||
__device__ int counter = 0;
|
||||
|
||||
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
|
||||
{
|
||||
__shared__ volatile int smem[18][18];
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
|
||||
if (threadIdx.y == 0)
|
||||
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
|
||||
if (threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
|
||||
if (threadIdx.x == 0)
|
||||
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1)
|
||||
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
|
||||
if (threadIdx.x == 0 && threadIdx.y == 0)
|
||||
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
|
||||
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
|
||||
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (x >= map.cols || y >= map.rows)
|
||||
return;
|
||||
|
||||
int n;
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 16; ++k)
|
||||
{
|
||||
n = 0;
|
||||
|
||||
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
map(y, x) = e;
|
||||
|
||||
n = 0;
|
||||
|
||||
if (e == 2)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CANNY_SHIFT 15
|
||||
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
|
||||
|
||||
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
|
||||
if (n > 0)
|
||||
{
|
||||
__shared__ float smem[18][18];
|
||||
const int ind = ::atomicAdd(&counter, 1);
|
||||
st[ind] = make_ushort2(x, y);
|
||||
}
|
||||
}
|
||||
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
|
||||
{
|
||||
void* counter_ptr;
|
||||
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
|
||||
|
||||
const int tid = threadIdx.y * 16 + threadIdx.x;
|
||||
const int lx = tid % 18;
|
||||
const int ly = tid / 18;
|
||||
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
|
||||
|
||||
if (ly < 14)
|
||||
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
|
||||
|
||||
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
|
||||
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
|
||||
edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace canny
|
||||
{
|
||||
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
|
||||
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
|
||||
|
||||
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
|
||||
{
|
||||
const int stack_size = 512;
|
||||
|
||||
__shared__ int s_counter;
|
||||
__shared__ int s_ind;
|
||||
__shared__ ushort2 s_st[stack_size];
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
s_counter = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int ind = blockIdx.y * gridDim.x + blockIdx.x;
|
||||
|
||||
if (ind >= count)
|
||||
return;
|
||||
|
||||
ushort2 pos = st1[ind];
|
||||
|
||||
if (threadIdx.x < 8)
|
||||
{
|
||||
pos.x += c_dx[threadIdx.x];
|
||||
pos.y += c_dy[threadIdx.x];
|
||||
|
||||
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
|
||||
{
|
||||
map(pos.y, pos.x) = 2;
|
||||
|
||||
ind = Emulation::smem::atomicAdd(&s_counter, 1);
|
||||
|
||||
s_st[ind] = pos;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
|
||||
{
|
||||
const int subTaskIdx = threadIdx.x >> 3;
|
||||
const int portion = ::min(s_counter, blockDim.x >> 3);
|
||||
|
||||
if (subTaskIdx < portion)
|
||||
pos = s_st[s_counter - 1 - subTaskIdx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
        if (i < rows && j < cols)
        {
            int x = dx.ptr(i)[j];
            int y = dy.ptr(i)[j];
            const int s = (x ^ y) < 0 ? -1 : 1;
            const float m = smem[threadIdx.y + 1][threadIdx.x + 1];

            x = ::abs(x);
            y = ::abs(y);

            // 0 - the pixel can not belong to an edge
            // 1 - the pixel might belong to an edge
            // 2 - the pixel does belong to an edge
            int edge_type = 0;

            if (m > low_thresh)
            {
                const int tg22x = x * TG22;
                const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);

                y <<= CANNY_SHIFT;

                if (y < tg22x)
                {
                    if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
                        edge_type = 1 + (int)(m > high_thresh);
                }
                else if( y > tg67x )
                {
                    if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
                        edge_type = 1 + (int)(m > high_thresh);
                }
                else
                {
                    if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
                        edge_type = 1 + (int)(m > high_thresh);
                }
            }

            map.ptr(i + 1)[j + 1] = edge_type;
        }
    }

    #undef CANNY_SHIFT
    #undef TG22

    void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }

    //////////////////////////////////////////////////////////////////////////////////////////

    __device__ unsigned int counter = 0;

    __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
    {
        #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)

        __shared__ int smem[18][18];

        const int j = blockIdx.x * 16 + threadIdx.x;
        const int i = blockIdx.y * 16 + threadIdx.y;

        const int tid = threadIdx.y * 16 + threadIdx.x;
        const int lx = tid % 18;
        const int ly = tid / 18;

        if (ly < 14)
            smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
            smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];

        __syncthreads();

        if (i < rows && j < cols)
        {
            int n;

            #pragma unroll
            for (int k = 0; k < 16; ++k)
            {
                n = 0;

                if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
                {
                    n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                    n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                    n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                    n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
                }

                if (n > 0)
                    smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
            }

            const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

            map.ptr(i + 1)[j + 1] = e;

            n = 0;

            if (e == 2)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
            }

            if (n > 0)
            {
                const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
                st[ind] = make_ushort2(j + 1, i + 1);
            }
        }

        #endif
    }

    void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );

        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }

__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};

__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120

const int stack_size = 512;

__shared__ unsigned int s_counter;
__shared__ unsigned int s_ind;
__shared__ ushort2 s_st[stack_size];

if (threadIdx.x == 0)
s_counter = 0;
s_counter -= portion;

__syncthreads();

int ind = blockIdx.y * gridDim.x + blockIdx.x;

if (ind < count)
if (subTaskIdx < portion)
{
ushort2 pos = st1[ind];
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];

if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
map(pos.y, pos.x) = 2;

if (map.ptr(pos.y)[pos.x] == 1)
{
map.ptr(pos.y)[pos.x] = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);

ind = atomicInc(&s_counter, (unsigned int)(-1));

s_st[ind] = pos;
}
}
__syncthreads();

while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);

pos.x = pos.y = 0;

if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();

if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();

if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];

if (map.ptr(pos.y)[pos.x] == 1)
{
map.ptr(pos.y)[pos.x] = 2;

ind = atomicInc(&s_counter, (unsigned int)(-1));

s_st[ind] = pos;
}
}
__syncthreads();
}

if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();

ind = s_ind;

for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
{
st2[ind + i] = s_st[i];
}
}
s_st[ind] = pos;
}
}

#endif
__syncthreads();
}

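// Note (editorial, not a line from this diff): the global hysteresis kernel above behaves as a
// breadth-first search. Each block starts from one queued weak-edge pixel, pushes its unvisited
// weak neighbours onto a small shared-memory stack, and repeatedly pops batches of stack entries
// (8 threads per popped pixel, one per neighbour) until the stack is empty or nearly full; any
// leftover candidates are flushed to a global queue (st2) for the next kernel launch.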
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
if (s_counter > 0)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

while (count > 0)
if (threadIdx.x == 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );

dim3 block(128, 1, 1);
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );

cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

std::swap(st1, st2);
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}

__syncthreads();

ind = s_ind;

for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
st2[ind + i] = s_st[i];
}
}

__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

while (count > 0)
{
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

if (i < rows && j < cols)
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
}
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

getEdges<<<grid, block>>>(map, dst, rows, cols);
edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace canny
}}} // namespace cv { namespace gpu { namespace device

cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

std::swap(st1, st2);
}
}
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }

        __device__ __forceinline__ GetEdges() {}
        __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst)
    {
        transform(map, dst, GetEdges(), WithOutMask(), 0);
    }
}

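// Note (editorial, not a line from this diff): GetEdges maps the final edge map to the 8-bit
// output in a single pass: e == 2 (confirmed edge) gives -(2 >> 1) = -1, i.e. 0xFF = 255,
// while e == 0 or e == 1 gives 0, so only confirmed edges survive in the result image.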
#endif /* CUDA_DISABLER */
@ -497,6 +497,7 @@ namespace cv { namespace gpu { namespace device

void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
{
    (void) flags;
    dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
    dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));


@ -42,10 +42,10 @@

#if !defined CUDA_DISABLER

#include <internal_shared.hpp>
#include <opencv2/gpu/device/transform.hpp>
#include <opencv2/gpu/device/color.hpp>
#include <cvt_colot_internal.h>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/color.hpp"
#include "cvt_color_internal.h"

namespace cv { namespace gpu { namespace device
{
@ -224,7 +224,7 @@ namespace cv { namespace gpu { namespace device
    };

    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
        void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream) \
        void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
        { \
            traits::functor_type functor = traits::create_functor(); \
            typedef typename traits::functor_type::argument_type src_t; \
@ -241,6 +241,10 @@ namespace cv { namespace gpu { namespace device
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)

    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)

    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
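// Note (editorial, not a line from this diff): with the signature change above, an instantiation
// such as OPENCV_GPU_IMPLEMENT_CVTCOLOR(bgr_to_rgb_8u, bgr_to_rgb_traits<uchar>) now expands to
//     void bgr_to_rgb_8u(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
// taking its buffers by value instead of the previous const PtrStepSzb& parameters;
// bgr_to_rgb_8u / bgr_to_rgb_traits are used here only as an illustrative name pair.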
@ -339,46 +343,119 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)

    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace gpu { namespace device

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/column_filter.0.cu  Normal file
@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "column_filter.h"

namespace filter
{
    template void linearColumn<float, uchar>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
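// Note (editorial, not a line from this diff): column_filter.0.cu .. column_filter.14.cu each
// hold a single explicit instantiation of filter::linearColumn for one (kernel, destination)
// type pair, while the shared implementation presumably lives in column_filter.h, which every
// file includes. Splitting the instantiations into separate translation units keeps each .cu
// file small, presumably to reduce nvcc compile time and memory per file and to let the
// instantiations compile in parallel.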
53
modules/gpu/src/cuda/column_filter.1.cu
Normal file
53
modules/gpu/src/cuda/column_filter.1.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.10.cu
Normal file
53
modules/gpu/src/cuda/column_filter.10.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.11.cu
Normal file
53
modules/gpu/src/cuda/column_filter.11.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.12.cu
Normal file
53
modules/gpu/src/cuda/column_filter.12.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.13.cu
Normal file
53
modules/gpu/src/cuda/column_filter.13.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, int3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.14.cu
Normal file
53
modules/gpu/src/cuda/column_filter.14.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float4, int4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.2.cu
Normal file
53
modules/gpu/src/cuda/column_filter.2.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.3.cu
Normal file
53
modules/gpu/src/cuda/column_filter.3.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.4.cu
Normal file
53
modules/gpu/src/cuda/column_filter.4.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.5.cu
Normal file
53
modules/gpu/src/cuda/column_filter.5.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.6.cu
Normal file
53
modules/gpu/src/cuda/column_filter.6.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "column_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearColumn<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
53
modules/gpu/src/cuda/column_filter.7.cu
Normal file
53
modules/gpu/src/cuda/column_filter.7.cu
Normal file
@ -0,0 +1,53 @@
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#if !defined CUDA_DISABLER

#include "column_filter.h"

namespace filter
{
    template void linearColumn<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
modules/gpu/src/cuda/column_filter.8.cu (new file, 53 lines)
@ -0,0 +1,53 @@
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#if !defined CUDA_DISABLER

#include "column_filter.h"

namespace filter
{
    template void linearColumn<float, short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
modules/gpu/src/cuda/column_filter.9.cu (new file, 53 lines)
@ -0,0 +1,53 @@
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#if !defined CUDA_DISABLER

#include "column_filter.h"

namespace filter
{
    template void linearColumn<float4, short4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
@ -1,391 +0,0 @@ (entire file deleted; its former contents follow)
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
#include "opencv2/gpu/device/static_check.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace column_filter
|
||||
{
|
||||
#define MAX_KERNEL_SIZE 32
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
|
||||
{
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, typename B>
|
||||
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
|
||||
{
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int BLOCK_DIM_X = 16;
|
||||
const int BLOCK_DIM_Y = 16;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
|
||||
#else
|
||||
const int BLOCK_DIM_X = 16;
|
||||
const int BLOCK_DIM_Y = 8;
|
||||
const int PATCH_PER_BLOCK = 2;
|
||||
const int HALO_SIZE = 2;
|
||||
#endif
|
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
|
||||
|
||||
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
|
||||
|
||||
if (x >= src.cols)
|
||||
return;
|
||||
|
||||
const T* src_col = src.ptr() + x;
|
||||
|
||||
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
|
||||
|
||||
if (blockIdx.y > 0)
|
||||
{
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
|
||||
}
|
||||
else
|
||||
{
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
|
||||
}
|
||||
|
||||
if (blockIdx.y + 2 < gridDim.y)
|
||||
{
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
|
||||
}
|
||||
else
|
||||
{
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
{
|
||||
const int y = yStart + j * BLOCK_DIM_Y;
|
||||
|
||||
if (y < src.rows)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < KSIZE; ++k)
|
||||
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
|
||||
|
||||
dst(y, x) = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B>
|
||||
void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
|
||||
{
|
||||
int BLOCK_DIM_X;
|
||||
int BLOCK_DIM_Y;
|
||||
int PATCH_PER_BLOCK;
|
||||
|
||||
if (cc >= 20)
|
||||
{
|
||||
BLOCK_DIM_X = 16;
|
||||
BLOCK_DIM_Y = 16;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCK_DIM_X = 16;
|
||||
BLOCK_DIM_Y = 8;
|
||||
PATCH_PER_BLOCK = 2;
|
||||
}
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
|
||||
|
||||
B<T> brd(src.rows);
|
||||
|
||||
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T, typename D>
|
||||
void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5][33] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller< 1, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 2, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 3, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 4, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 5, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 6, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 7, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 8, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller< 9, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<17, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<18, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<19, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<20, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<21, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<22, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<23, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<24, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<25, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<26, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<27, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<28, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<29, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<30, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<31, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<32, T, D, BrdColWrap>
|
||||
}
|
||||
};
|
||||
|
||||
loadKernel(kernel, ksize, stream);
|
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
|
||||
}
|
||||
|
||||
template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , int >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
} // namespace column_filter
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
|
||||
#endif /* CUDA_DISABLER */
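The deleted monolithic file above and the new column_filter.h implement the same kernel: the filter taps live in __constant__ memory (loadKernel copies them with cudaMemcpyToSymbol or cudaMemcpyToSymbolAsync), each block stages a (PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y tall column tile in shared memory with upper and lower halos, and every thread then accumulates KSIZE taps for up to PATCH_PER_BLOCK output rows. The sketch below shows only the core idea, constant-memory coefficients, one output pixel per thread and replicate-border handling, without the shared-memory tiling; naiveColumnFilter and d_coeffs are illustrative names, not part of the sources.

// Illustrative sketch, not the OpenCV kernel: a naive column convolution
// with the coefficients kept in constant memory.
#define MAX_TAPS 32
__constant__ float d_coeffs[MAX_TAPS];

__global__ void naiveColumnFilter(const float* src, float* dst,
                                  int rows, int cols, int stride,
                                  int ksize, int anchor)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= cols || y >= rows)
        return;

    float sum = 0.0f;
    for (int k = 0; k < ksize; ++k)
    {
        int yy = y - anchor + k;
        yy = ::max(0, ::min(yy, rows - 1));   // BORDER_REPLICATE along the column
        sum += src[yy * stride + x] * d_coeffs[k];
    }
    dst[y * stride + x] = sum;
}

Before the launch the host uploads the coefficients once, e.g. cudaMemcpyToSymbol(d_coeffs, hostCoeffs, ksize * sizeof(float)), which is exactly what loadKernel() does for c_kernel above.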
modules/gpu/src/cuda/column_filter.h (new file, 373 lines)
@ -0,0 +1,373 @@
/* Standard OpenCV BSD-style license header (identical in every file in this patch) omitted. */
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace column_filter
|
||||
{
|
||||
#define MAX_KERNEL_SIZE 32
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
template <int KSIZE, typename T, typename D, typename B>
|
||||
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
|
||||
{
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int BLOCK_DIM_X = 16;
|
||||
const int BLOCK_DIM_Y = 16;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
|
||||
#else
|
||||
const int BLOCK_DIM_X = 16;
|
||||
const int BLOCK_DIM_Y = 8;
|
||||
const int PATCH_PER_BLOCK = 2;
|
||||
const int HALO_SIZE = 2;
|
||||
#endif
|
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
|
||||
|
||||
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
|
||||
|
||||
if (x >= src.cols)
|
||||
return;
|
||||
|
||||
const T* src_col = src.ptr() + x;
|
||||
|
||||
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
|
||||
|
||||
if (blockIdx.y > 0)
|
||||
{
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
|
||||
}
|
||||
else
|
||||
{
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
|
||||
}
|
||||
|
||||
if (blockIdx.y + 2 < gridDim.y)
|
||||
{
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
|
||||
}
|
||||
else
|
||||
{
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
{
|
||||
const int y = yStart + j * BLOCK_DIM_Y;
|
||||
|
||||
if (y < src.rows)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < KSIZE; ++k)
|
||||
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
|
||||
|
||||
dst(y, x) = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B>
|
||||
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
|
||||
{
|
||||
int BLOCK_DIM_X;
|
||||
int BLOCK_DIM_Y;
|
||||
int PATCH_PER_BLOCK;
|
||||
|
||||
if (cc >= 20)
|
||||
{
|
||||
BLOCK_DIM_X = 16;
|
||||
BLOCK_DIM_Y = 16;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCK_DIM_X = 16;
|
||||
BLOCK_DIM_Y = 8;
|
||||
PATCH_PER_BLOCK = 2;
|
||||
}
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
|
||||
|
||||
B<T> brd(src.rows);
|
||||
|
||||
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5][33] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 2, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 3, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 4, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 5, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 6, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 7, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 8, T, D, BrdColReflect101>,
|
||||
column_filter::caller< 9, T, D, BrdColReflect101>,
|
||||
column_filter::caller<10, T, D, BrdColReflect101>,
|
||||
column_filter::caller<11, T, D, BrdColReflect101>,
|
||||
column_filter::caller<12, T, D, BrdColReflect101>,
|
||||
column_filter::caller<13, T, D, BrdColReflect101>,
|
||||
column_filter::caller<14, T, D, BrdColReflect101>,
|
||||
column_filter::caller<15, T, D, BrdColReflect101>,
|
||||
column_filter::caller<16, T, D, BrdColReflect101>,
|
||||
column_filter::caller<17, T, D, BrdColReflect101>,
|
||||
column_filter::caller<18, T, D, BrdColReflect101>,
|
||||
column_filter::caller<19, T, D, BrdColReflect101>,
|
||||
column_filter::caller<20, T, D, BrdColReflect101>,
|
||||
column_filter::caller<21, T, D, BrdColReflect101>,
|
||||
column_filter::caller<22, T, D, BrdColReflect101>,
|
||||
column_filter::caller<23, T, D, BrdColReflect101>,
|
||||
column_filter::caller<24, T, D, BrdColReflect101>,
|
||||
column_filter::caller<25, T, D, BrdColReflect101>,
|
||||
column_filter::caller<26, T, D, BrdColReflect101>,
|
||||
column_filter::caller<27, T, D, BrdColReflect101>,
|
||||
column_filter::caller<28, T, D, BrdColReflect101>,
|
||||
column_filter::caller<29, T, D, BrdColReflect101>,
|
||||
column_filter::caller<30, T, D, BrdColReflect101>,
|
||||
column_filter::caller<31, T, D, BrdColReflect101>,
|
||||
column_filter::caller<32, T, D, BrdColReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 2, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 3, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 4, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 5, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 6, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 7, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 8, T, D, BrdColReplicate>,
|
||||
column_filter::caller< 9, T, D, BrdColReplicate>,
|
||||
column_filter::caller<10, T, D, BrdColReplicate>,
|
||||
column_filter::caller<11, T, D, BrdColReplicate>,
|
||||
column_filter::caller<12, T, D, BrdColReplicate>,
|
||||
column_filter::caller<13, T, D, BrdColReplicate>,
|
||||
column_filter::caller<14, T, D, BrdColReplicate>,
|
||||
column_filter::caller<15, T, D, BrdColReplicate>,
|
||||
column_filter::caller<16, T, D, BrdColReplicate>,
|
||||
column_filter::caller<17, T, D, BrdColReplicate>,
|
||||
column_filter::caller<18, T, D, BrdColReplicate>,
|
||||
column_filter::caller<19, T, D, BrdColReplicate>,
|
||||
column_filter::caller<20, T, D, BrdColReplicate>,
|
||||
column_filter::caller<21, T, D, BrdColReplicate>,
|
||||
column_filter::caller<22, T, D, BrdColReplicate>,
|
||||
column_filter::caller<23, T, D, BrdColReplicate>,
|
||||
column_filter::caller<24, T, D, BrdColReplicate>,
|
||||
column_filter::caller<25, T, D, BrdColReplicate>,
|
||||
column_filter::caller<26, T, D, BrdColReplicate>,
|
||||
column_filter::caller<27, T, D, BrdColReplicate>,
|
||||
column_filter::caller<28, T, D, BrdColReplicate>,
|
||||
column_filter::caller<29, T, D, BrdColReplicate>,
|
||||
column_filter::caller<30, T, D, BrdColReplicate>,
|
||||
column_filter::caller<31, T, D, BrdColReplicate>,
|
||||
column_filter::caller<32, T, D, BrdColReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColConstant>,
|
||||
column_filter::caller< 2, T, D, BrdColConstant>,
|
||||
column_filter::caller< 3, T, D, BrdColConstant>,
|
||||
column_filter::caller< 4, T, D, BrdColConstant>,
|
||||
column_filter::caller< 5, T, D, BrdColConstant>,
|
||||
column_filter::caller< 6, T, D, BrdColConstant>,
|
||||
column_filter::caller< 7, T, D, BrdColConstant>,
|
||||
column_filter::caller< 8, T, D, BrdColConstant>,
|
||||
column_filter::caller< 9, T, D, BrdColConstant>,
|
||||
column_filter::caller<10, T, D, BrdColConstant>,
|
||||
column_filter::caller<11, T, D, BrdColConstant>,
|
||||
column_filter::caller<12, T, D, BrdColConstant>,
|
||||
column_filter::caller<13, T, D, BrdColConstant>,
|
||||
column_filter::caller<14, T, D, BrdColConstant>,
|
||||
column_filter::caller<15, T, D, BrdColConstant>,
|
||||
column_filter::caller<16, T, D, BrdColConstant>,
|
||||
column_filter::caller<17, T, D, BrdColConstant>,
|
||||
column_filter::caller<18, T, D, BrdColConstant>,
|
||||
column_filter::caller<19, T, D, BrdColConstant>,
|
||||
column_filter::caller<20, T, D, BrdColConstant>,
|
||||
column_filter::caller<21, T, D, BrdColConstant>,
|
||||
column_filter::caller<22, T, D, BrdColConstant>,
|
||||
column_filter::caller<23, T, D, BrdColConstant>,
|
||||
column_filter::caller<24, T, D, BrdColConstant>,
|
||||
column_filter::caller<25, T, D, BrdColConstant>,
|
||||
column_filter::caller<26, T, D, BrdColConstant>,
|
||||
column_filter::caller<27, T, D, BrdColConstant>,
|
||||
column_filter::caller<28, T, D, BrdColConstant>,
|
||||
column_filter::caller<29, T, D, BrdColConstant>,
|
||||
column_filter::caller<30, T, D, BrdColConstant>,
|
||||
column_filter::caller<31, T, D, BrdColConstant>,
|
||||
column_filter::caller<32, T, D, BrdColConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColReflect>,
|
||||
column_filter::caller< 2, T, D, BrdColReflect>,
|
||||
column_filter::caller< 3, T, D, BrdColReflect>,
|
||||
column_filter::caller< 4, T, D, BrdColReflect>,
|
||||
column_filter::caller< 5, T, D, BrdColReflect>,
|
||||
column_filter::caller< 6, T, D, BrdColReflect>,
|
||||
column_filter::caller< 7, T, D, BrdColReflect>,
|
||||
column_filter::caller< 8, T, D, BrdColReflect>,
|
||||
column_filter::caller< 9, T, D, BrdColReflect>,
|
||||
column_filter::caller<10, T, D, BrdColReflect>,
|
||||
column_filter::caller<11, T, D, BrdColReflect>,
|
||||
column_filter::caller<12, T, D, BrdColReflect>,
|
||||
column_filter::caller<13, T, D, BrdColReflect>,
|
||||
column_filter::caller<14, T, D, BrdColReflect>,
|
||||
column_filter::caller<15, T, D, BrdColReflect>,
|
||||
column_filter::caller<16, T, D, BrdColReflect>,
|
||||
column_filter::caller<17, T, D, BrdColReflect>,
|
||||
column_filter::caller<18, T, D, BrdColReflect>,
|
||||
column_filter::caller<19, T, D, BrdColReflect>,
|
||||
column_filter::caller<20, T, D, BrdColReflect>,
|
||||
column_filter::caller<21, T, D, BrdColReflect>,
|
||||
column_filter::caller<22, T, D, BrdColReflect>,
|
||||
column_filter::caller<23, T, D, BrdColReflect>,
|
||||
column_filter::caller<24, T, D, BrdColReflect>,
|
||||
column_filter::caller<25, T, D, BrdColReflect>,
|
||||
column_filter::caller<26, T, D, BrdColReflect>,
|
||||
column_filter::caller<27, T, D, BrdColReflect>,
|
||||
column_filter::caller<28, T, D, BrdColReflect>,
|
||||
column_filter::caller<29, T, D, BrdColReflect>,
|
||||
column_filter::caller<30, T, D, BrdColReflect>,
|
||||
column_filter::caller<31, T, D, BrdColReflect>,
|
||||
column_filter::caller<32, T, D, BrdColReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
column_filter::caller< 1, T, D, BrdColWrap>,
|
||||
column_filter::caller< 2, T, D, BrdColWrap>,
|
||||
column_filter::caller< 3, T, D, BrdColWrap>,
|
||||
column_filter::caller< 4, T, D, BrdColWrap>,
|
||||
column_filter::caller< 5, T, D, BrdColWrap>,
|
||||
column_filter::caller< 6, T, D, BrdColWrap>,
|
||||
column_filter::caller< 7, T, D, BrdColWrap>,
|
||||
column_filter::caller< 8, T, D, BrdColWrap>,
|
||||
column_filter::caller< 9, T, D, BrdColWrap>,
|
||||
column_filter::caller<10, T, D, BrdColWrap>,
|
||||
column_filter::caller<11, T, D, BrdColWrap>,
|
||||
column_filter::caller<12, T, D, BrdColWrap>,
|
||||
column_filter::caller<13, T, D, BrdColWrap>,
|
||||
column_filter::caller<14, T, D, BrdColWrap>,
|
||||
column_filter::caller<15, T, D, BrdColWrap>,
|
||||
column_filter::caller<16, T, D, BrdColWrap>,
|
||||
column_filter::caller<17, T, D, BrdColWrap>,
|
||||
column_filter::caller<18, T, D, BrdColWrap>,
|
||||
column_filter::caller<19, T, D, BrdColWrap>,
|
||||
column_filter::caller<20, T, D, BrdColWrap>,
|
||||
column_filter::caller<21, T, D, BrdColWrap>,
|
||||
column_filter::caller<22, T, D, BrdColWrap>,
|
||||
column_filter::caller<23, T, D, BrdColWrap>,
|
||||
column_filter::caller<24, T, D, BrdColWrap>,
|
||||
column_filter::caller<25, T, D, BrdColWrap>,
|
||||
column_filter::caller<26, T, D, BrdColWrap>,
|
||||
column_filter::caller<27, T, D, BrdColWrap>,
|
||||
column_filter::caller<28, T, D, BrdColWrap>,
|
||||
column_filter::caller<29, T, D, BrdColWrap>,
|
||||
column_filter::caller<30, T, D, BrdColWrap>,
|
||||
column_filter::caller<31, T, D, BrdColWrap>,
|
||||
column_filter::caller<32, T, D, BrdColWrap>
|
||||
}
|
||||
};
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
|
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
|
||||
}
|
||||
}
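filter::linearColumn above turns the runtime (brd_type, ksize) pair into a compile-time kernel size and border policy by indexing a static 5 x 33 table of function pointers, each entry pointing at a different column_filter::caller<KSIZE, T, D, B> instantiation; index 0 is a null sentinel because kernel sizes start at 1. A reduced sketch of the same dispatch pattern, with three sizes and a single policy, follows; runFixed and dispatch are illustrative names only.

// Illustrative sketch of the compile-time-size dispatch pattern, not OpenCV code.
template <int KSIZE>
void runFixed(const float* src, float* dst, int n)
{
    // KSIZE is a compile-time constant here, so the compiler can fully unroll the inner loop.
    for (int i = 0; i + KSIZE <= n; ++i)
    {
        float s = 0.0f;
        for (int k = 0; k < KSIZE; ++k)
            s += src[i + k];
        dst[i] = s;
    }
}

void dispatch(const float* src, float* dst, int n, int ksize)
{
    typedef void (*caller_t)(const float*, float*, int);
    static const caller_t callers[] = { 0, runFixed<1>, runFixed<2>, runFixed<3> };
    callers[ksize](src, dst, n);   // index 0 is a null sentinel; ksize must be 1..3
}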
(File diff suppressed because it is too large.)
@ -46,6 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "fgd_bgfg_common.hpp"

using namespace cv::gpu;
@ -181,57 +183,8 @@ namespace bgfg
|
||||
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
|
||||
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
|
||||
|
||||
data0[threadIdx.x] = sum0;
|
||||
data1[threadIdx.x] = sum1;
|
||||
data2[threadIdx.x] = sum2;
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 128)
|
||||
{
|
||||
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
|
||||
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
|
||||
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 64)
|
||||
{
|
||||
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
|
||||
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
|
||||
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 32)
|
||||
{
|
||||
volatile unsigned int* vdata0 = data0;
|
||||
volatile unsigned int* vdata1 = data1;
|
||||
volatile unsigned int* vdata2 = data2;
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];
|
||||
|
||||
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
|
||||
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
|
||||
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
|
||||
}
|
||||
plus<unsigned int> op;
|
||||
reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
@ -245,9 +198,9 @@ namespace bgfg
void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
                           unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                           unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
                           int cc, cudaStream_t stream)
                           bool cc20, cudaStream_t stream)
{
    const int HISTOGRAM_WARP_COUNT = cc < 20 ? 4 : 6;
    const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
    const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;

    calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
@ -261,10 +214,10 @@ namespace bgfg
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// calcDiffThreshMask
|
||||
|
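Two things change in the fgd kernels above: the hand-written 128/64/warp-synchronous shared-memory reduction is replaced by the device::reduce<MERGE_THREADBLOCK_SIZE>(...) utility from the newly included reduce.hpp header, and the histogram warp count is now chosen from a bool cc20 flag instead of comparing a raw compute-capability value. For reference, the generic block-level tree reduction that the removed code implemented looks roughly like the sketch below; blockReduceSum is an illustrative name, and the OpenCV utility additionally reduces three sums at once through a tuple.

// Illustrative sketch of a shared-memory tree reduction, not the OpenCV utility.
template <int BLOCK_SIZE>
__device__ unsigned int blockReduceSum(unsigned int val, unsigned int* smem)
{
    const int tid = threadIdx.x;
    smem[tid] = val;
    __syncthreads();

    // Halve the number of active threads each step until one partial sum remains.
    for (int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2)
    {
        if (tid < stride)
            smem[tid] += smem[tid + stride];
        __syncthreads();
    }
    return smem[0];   // every thread sees the final sum after the last barrier
}

Unlike the removed code, this version synchronizes on every step instead of relying on implicit warp-synchronous execution for the last 32 threads, which is the safer pattern on later architectures.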
@ -125,7 +125,7 @@ namespace bgfg
void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
                           unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                           unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
                           int cc, cudaStream_t stream);
                           bool cc20, cudaStream_t stream);

template <typename PT, typename CT>
void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
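The header change above mirrors the .cu change: calcDiffHistogram_gpu now takes a bool cc20 flag rather than the raw compute-capability number, so the caller decides once whether the device is Fermi-class (sm_20) or newer. A caller could derive such a flag from the device properties roughly as in this illustrative helper; deviceHasCC20 is not an OpenCV function, and the gpu module exposes the same information through its DeviceInfo class.

#include <cuda_runtime.h>

// Illustrative helper: true when the selected device is compute capability 2.0 or newer.
bool deviceHasCC20(int deviceId)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, deviceId);
    return prop.major >= 2;
}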
@ -47,6 +47,7 @@
#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#include "opencv2/gpu/device/common.hpp"
@ -43,12 +43,10 @@
#if !defined CUDA_DISABLER

#include "thrust/device_ptr.h"
#include "thrust/remove.h"
#include "thrust/functional.h"
#include "internal_shared.hpp"

using namespace thrust;
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include "opencv2/gpu/device/common.hpp"

namespace cv { namespace gpu { namespace device { namespace globmotion {

@ -61,10 +59,10 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
    thrust::device_ptr<float2> dpoints1((float2*)points1);
    thrust::device_ptr<const uchar> dmask(mask);

    return thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
    return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
                 thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
                 dmask, thrust::not1(thrust::identity<uchar>()))
                 - make_zip_iterator(make_tuple(dpoints0, dpoints1));
                 - thrust::make_zip_iterator(make_tuple(dpoints0, dpoints1)));
}
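The compactPoints hunk above fixes two things: the helper calls are fully qualified with thrust:: (the file no longer relies on using namespace thrust), and the iterator difference returned by remove_if is explicitly cast to int, since the function returns int while the iterator difference is a 64-bit value on most platforms. The function itself is a stream compaction: the mask acts as a stencil, and point pairs whose mask byte is zero are removed from both arrays in one pass over a zip iterator. A self-contained sketch of the same idea on raw device pointers follows; compactPairs and the variable names are illustrative.

// Illustrative sketch of mask-driven stream compaction with Thrust; compile with nvcc.
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

// Keeps only the (p0[i], p1[i]) pairs whose mask[i] is non-zero and returns the new count.
int compactPairs(int n, float2* p0, float2* p1, const unsigned char* mask)
{
    thrust::device_ptr<float2> d0(p0), d1(p1);
    thrust::device_ptr<const unsigned char> dmask(mask);

    return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(d0, d1)),
                                   thrust::make_zip_iterator(thrust::make_tuple(d0 + n, d1 + n)),
                                   dmask,
                                   thrust::not1(thrust::identity<unsigned char>()))
                 - thrust::make_zip_iterator(thrust::make_tuple(d0, d1)));
}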
@ -43,182 +43,112 @@
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/utility.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/functional.hpp"
|
||||
#include "opencv2/gpu/device/emulation.hpp"
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace hist
|
||||
{
|
||||
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
|
||||
{
|
||||
__shared__ int shist[256];
|
||||
|
||||
const int y = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
shist[tid] = 0;
|
||||
__syncthreads();
|
||||
|
||||
if (y < rows)
|
||||
{
|
||||
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
|
||||
|
||||
const int cols_4 = cols / 4;
|
||||
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
|
||||
{
|
||||
unsigned int data = rowPtr[x];
|
||||
|
||||
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
|
||||
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
|
||||
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
|
||||
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
|
||||
}
|
||||
|
||||
if (cols % 4 != 0 && threadIdx.x == 0)
|
||||
{
|
||||
for (int x = cols_4 * 4; x < cols; ++x)
|
||||
{
|
||||
unsigned int data = ((const uchar*)rowPtr)[x];
|
||||
Emulation::smem::atomicAdd(&shist[data], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int histVal = shist[tid];
|
||||
if (histVal > 0)
|
||||
::atomicAdd(hist + tid, histVal);
|
||||
}
|
||||
|
||||
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(32, 8);
|
||||
const dim3 grid(divUp(src.rows, block.y));
|
||||
|
||||
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
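The rewritten histogram256 entry point above replaces the older per-warp sub-histogram scheme with a single shared 256-bin histogram per block: a 32 x 8 block zeroes the bins, each block consumes its rows as packed 32-bit words, bytes are counted with shared-memory atomics, and finally each thread flushes one non-empty bin to the global histogram with a single atomicAdd. A stripped-down sketch of that strategy, one byte per load and without the packed-word handling, follows; hist256Simple is an illustrative name.

// Illustrative sketch of a per-block shared-memory histogram; launch with blockDim.x == 256.
__global__ void hist256Simple(const unsigned char* data, int n, unsigned int* hist)
{
    __shared__ unsigned int sh[256];

    // Each thread owns one bin of the per-block sub-histogram.
    sh[threadIdx.x] = 0;
    __syncthreads();

    // Grid-stride loop over the input, accumulating into shared memory.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        atomicAdd(&sh[data[i]], 1u);
    __syncthreads();

    // One global atomic per non-empty bin per block.
    if (sh[threadIdx.x] > 0)
        atomicAdd(&hist[threadIdx.x], sh[threadIdx.x]);
}

Shared-memory atomics require compute capability 1.2 or newer, which is what the USE_SMEM_ATOMICS guard in the removed implementation further below was checking for.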
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace hist
|
||||
{
|
||||
__constant__ int c_lut[256];
|
||||
|
||||
struct EqualizeHist : unary_function<uchar, uchar>
|
||||
{
|
||||
float scale;
|
||||
|
||||
__host__ EqualizeHist(float _scale) : scale(_scale) {}
|
||||
|
||||
__device__ __forceinline__ uchar operator ()(uchar val) const
|
||||
{
|
||||
const int lut = c_lut[val];
|
||||
return __float2int_rn(scale * lut);
|
||||
}
|
||||
};
|
||||
}
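The EqualizeHist functor above maps each pixel to __float2int_rn(scale * c_lut[val]), where the 256-entry lookup table sits in __constant__ memory and, in the host code not shown in this diff, is filled with the cumulative histogram while scale normalizes it to the 0..255 range; the functor is then applied to the whole image through the device::transform framework, which is why a TransformFunctorTraits specialization appears below. A host-side sketch of how such an equalization LUT can be built from a 256-bin histogram follows; buildEqualizeLut is illustrative, and the exact normalization used by OpenCV differs in details.

// Illustrative sketch: build a ready-to-use 0..255 equalization LUT on the host.
void buildEqualizeLut(const int hist[256], int totalPixels, unsigned char lut[256])
{
    const float scale = 255.0f / totalPixels;
    int cdf = 0;                                        // running cumulative histogram
    for (int i = 0; i < 256; ++i)
    {
        cdf += hist[i];
        lut[i] = (unsigned char)(scale * cdf + 0.5f);   // round to nearest, like __float2int_rn
    }
}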
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
#define UINT_BITS 32U
|
||||
|
||||
//Warps == subhistograms per threadblock
|
||||
#define WARP_COUNT 6
|
||||
|
||||
//Threadblock size
|
||||
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
|
||||
#define HISTOGRAM256_BIN_COUNT 256
|
||||
|
||||
//Shared memory per threadblock
|
||||
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
|
||||
|
||||
#define PARTIAL_HISTOGRAM256_COUNT 240
|
||||
|
||||
#define MERGE_THREADBLOCK_SIZE 256
|
||||
|
||||
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
|
||||
|
||||
namespace hist
|
||||
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
|
||||
{
|
||||
#if (!USE_SMEM_ATOMICS)
|
||||
|
||||
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
|
||||
|
||||
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
uint count;
|
||||
do
|
||||
{
|
||||
count = s_WarpHist[data] & TAG_MASK;
|
||||
count = threadTag | (count + 1);
|
||||
s_WarpHist[data] = count;
|
||||
} while (s_WarpHist[data] != count);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define TAG_MASK 0xFFFFFFFFU
|
||||
|
||||
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
atomicAdd(s_WarpHist + data, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
|
||||
{
|
||||
uint x = pos_x << 2;
|
||||
|
||||
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
|
||||
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
|
||||
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
|
||||
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
|
||||
}
|
||||
|
||||
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
|
||||
{
|
||||
//Per-warp subhistogram storage
|
||||
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
|
||||
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
|
||||
|
||||
//Clear shared memory storage for current threadblock before processing
|
||||
#pragma unroll
|
||||
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
|
||||
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
|
||||
|
||||
//Cycle through the entire data set, update subhistograms for each warp
|
||||
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
|
||||
|
||||
__syncthreads();
|
||||
const uint colsui = d_Data.step / sizeof(uint);
|
||||
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
|
||||
{
|
||||
uint pos_y = pos / colsui;
|
||||
uint pos_x = pos % colsui;
|
||||
uint data = d_Data.ptr(pos_y)[pos_x];
|
||||
addWord(s_WarpHist, data, tag, pos_x, cols);
|
||||
}
|
||||
|
||||
//Merge per-warp histograms into per-block and write to global memory
|
||||
__syncthreads();
|
||||
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
|
||||
{
|
||||
uint sum = 0;
|
||||
|
||||
for (uint i = 0; i < WARP_COUNT; i++)
|
||||
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
|
||||
|
||||
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge histogram256() output
|
||||
// Run one threadblock per bin; each threadblock adds up the same bin counter
|
||||
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
|
||||
// takes only a fraction of total processing time
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
|
||||
{
|
||||
uint sum = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
|
||||
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
|
||||
|
||||
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
|
||||
data[threadIdx.x] = sum;
|
||||
|
||||
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
__syncthreads();
|
||||
if(threadIdx.x < stride)
|
||||
data[threadIdx.x] += data[threadIdx.x + stride];
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
|
||||
}
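The comment block above describes the merge scheme; written serially, the same computation is simply a per-bin sum over all partial histograms (a sketch, not part of the patch; names are assumed):

```cpp
// Serial equivalent of mergeHistogram256: bin b of the final histogram is the
// sum of bin b across every partial histogram (count partials of 256 bins each).
void mergeHistogram256_ref(const unsigned int* partial, int count, int* hist)
{
    for (int bin = 0; bin < 256; ++bin)
    {
        unsigned int sum = 0;
        for (int i = 0; i < count; ++i)
            sum += partial[i * 256 + bin];
        hist[bin] = (int)sum;
    }
}
```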

void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
PtrStepSz<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);

cudaSafeCall( cudaGetLastError() );

mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);

cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__constant__ int c_lut[256];

__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x < src.cols && y < src.rows)
{
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}

void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
enum { smart_shift = 4 };
};
}}}

namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );

equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace hist
}}} // namespace cv { namespace gpu { namespace device
const float scale = 255.0f / (src.cols * src.rows);

transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
}

#endif /* CUDA_DISABLER */
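The kernels above only apply `c_lut`; the LUT itself is expected to hold the cumulative histogram of the source image, so that each pixel value v is mapped to round(255 * lut[v] / (cols * rows)). A host-side sketch of building such a LUT (illustrative only; whether the real caller computes it on the host or with a prefix-sum kernel is outside this hunk):

```cpp
// Sketch: equalisation LUT from a 256-bin histogram. lut[v] is the cumulative
// count of pixels with value <= v; equalizeHist then scales it by 255/(cols*rows).
void buildEqualizeLut(const int hist[256], int lut[256])
{
    int cum = 0;
    for (int v = 0; v < 256; ++v)
    {
        cum += hist[v];
        lut[v] = cum;
    }
}
```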
@ -42,7 +42,10 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
@ -226,29 +229,32 @@ namespace cv { namespace gpu { namespace device

template<int size>
__device__ float reduce_smem(volatile float* smem)
__device__ float reduce_smem(float* smem, float val)
{
unsigned int tid = threadIdx.x;
float sum = smem[tid];
float sum = val;

if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
reduce<size>(smem, sum, tid, plus<float>());

if (tid < 32)
if (size == 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
#if __CUDA_ARCH__ >= 300
return shfl(sum, 0);
#else
return smem[0];
#endif
}
else
{
#if __CUDA_ARCH__ >= 300
if (threadIdx.x == 0)
smem[0] = sum;
#endif

__syncthreads();
sum = smem[0];
__syncthreads();

return sum;
return smem[0];
}
}

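The compute-capability 3.0 path above returns `shfl(sum, 0)`: on those devices a 32-thread warp can finish the reduction entirely in registers, with no shared memory or barrier. A minimal sketch of that idea, using the older non-`_sync` shuffle intrinsics this code base targets (illustrative, not the `reduce`/`shfl` wrappers themselves):

```cuda
// Warp-wide sum in registers via shuffle (assumes sm_30+, full 32-lane warp).
__device__ __forceinline__ float warpReduceSum(float val)
{
    #pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1)
        val += __shfl_down(val, delta);   // lane i adds the value held by lane i + delta
    return __shfl(val, 0);                // broadcast lane 0's total to every lane
}
```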
@ -272,19 +278,13 @@ namespace cv { namespace gpu { namespace device
if (threadIdx.x < block_hist_size)
elem = hist[0];

squares[threadIdx.x] = elem * elem;

__syncthreads();
float sum = reduce_smem<nthreads>(squares);
float sum = reduce_smem<nthreads>(squares, elem * elem);

float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = ::min(elem * scale, threshold);

__syncthreads();
squares[threadIdx.x] = elem * elem;
sum = reduce_smem<nthreads>(squares, elem * elem);

__syncthreads();
sum = reduce_smem<nthreads>(squares);
scale = 1.0f / (::sqrtf(sum) + 1e-3f);

if (threadIdx.x < block_hist_size)
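The hunk above is the HOG block-histogram normalisation; written serially it amounts to an L2-Hys step, where `threshold` clips outliers between the two normalisation passes (a sketch of the computation, not the kernel itself):

```cpp
#include <algorithm>
#include <cmath>

// Serial sketch of the per-block L2-Hys normalisation performed above.
void normalizeBlockHist(float* hist, int n, float threshold)
{
    float sum = 0.f;
    for (int i = 0; i < n; ++i) sum += hist[i] * hist[i];
    float scale = 1.f / (std::sqrt(sum) + 0.1f * n);

    sum = 0.f;
    for (int i = 0; i < n; ++i)
    {
        hist[i] = std::min(hist[i] * scale, threshold);  // clip after the first pass
        sum += hist[i] * hist[i];
    }
    scale = 1.f / (std::sqrt(sum) + 1e-3f);
    for (int i = 0; i < n; ++i) hist[i] *= scale;
}
```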
@ -330,65 +330,36 @@ namespace cv { namespace gpu { namespace device

// return confidence values not just positive location
template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram block processed by single GPU thread block
int nblocks> // Number of histogram block processed by single GPU thread block
__global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
float free_coef, float threshold, float* confidences)
{
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;

const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;

float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}

__shared__ float products[nthreads * nblocks];
__shared__ float products[nthreads * nblocks];

const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;
const int tid = threadIdx.z * nthreads + threadIdx.x;

__syncthreads();
reduce<nthreads>(products, product, tid, plus<float>());

if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}

if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}

if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
= (float)(product + free_coef);
if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;

}

@ -396,32 +367,32 @@ namespace cv { namespace gpu { namespace device
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
float* coefs, float free_coef, float threshold, float *confidences)
{
const int nthreads = 256;
const int nblocks = 1;
const int nthreads = 256;
const int nblocks = 1;

int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;

dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);
dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);

cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
cudaFuncCachePreferL1));
cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
cudaFuncCachePreferL1));

int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, confidences);
cudaSafeCall(cudaThreadSynchronize());
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, confidences);
cudaSafeCall(cudaThreadSynchronize());
}

template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram block processed by single GPU thread block
int nblocks> // Number of histogram block processed by single GPU thread block
__global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
@ -446,36 +417,8 @@ namespace cv { namespace gpu { namespace device
__shared__ float products[nthreads * nblocks];

const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;

__syncthreads();

if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}

if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}
reduce<nthreads>(products, product, tid, plus<float>());

if (threadIdx.x == 0)
labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);

@ -42,7 +42,9 @@

#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"

@ -295,7 +295,7 @@ namespace cv { namespace gpu { namespace device
int grid = divUp(workAmount, block);
cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
}
}
}}}

@ -76,7 +76,7 @@ namespace cv { namespace gpu { namespace device
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0 * CV_PI;
angle += (angle < 0) * 2.0f * CV_PI_F;
dst[y * dst_step + x] = scale * angle;
}
};
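Since ::atan2f returns values in [-π, π], the branch-free term `(angle < 0) * 2.0f * CV_PI_F` shifts only negative results into the [0, 2π) range: for example atan2f(-1.f, 0.f) = -π/2 becomes 3π/2, while atan2f(1.f, 0.f) = π/2 is left unchanged. Using CV_PI_F instead of the double CV_PI keeps the whole expression in single precision.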
@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);

const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;

cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
@ -190,7 +190,7 @@ namespace cv { namespace gpu { namespace device
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);

const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;

polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);

File diff suppressed because it is too large
@ -43,11 +43,11 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/block.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

using namespace cv::gpu;
@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device
{
namespace imgproc
{

template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
}

static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
{
return thrust::tie(val1, val2);
}

static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}

static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
{
return thrust::tie(val1, val2.x, val2.y);
}

static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}

static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z);
}

static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}

static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
}

static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op, op);
}
};

__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}

volatile __shared__ float cta_buffer[CTA_SIZE];
__shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];

int tid = threadIdx.x;
reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
threadIdx.x,
Unroll<VecTraits<T>::cn>::op());

cta_buffer[tid] = weights_sum;
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
weights_sum = cta_buffer[0];

__syncthreads();

for(int n = 0; n < VecTraits<T>::cn; ++n)
{
cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];

__syncthreads();
}

if (tid == 0)
dst = saturate_cast<T>(sum/weights_sum);
if (threadIdx.x == 0)
dst = saturate_cast<T>(sum / weights_sum);
}

__device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
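In the hunk above, `sum` and `weights_sum` accumulate the non-local-means estimate dst(p) = Σ_q w(p, q) · src(q) / Σ_q w(p, q); the change replaces the old per-channel Block::reduce loop with a single tuple-based reduce that sums `weights_sum` and every channel of `sum` in one pass over the enlarged shared buffer.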
@ -164,40 +164,40 @@ namespace cv { namespace gpu { namespace device

r = ::fmin(r, 2.5f);

v[1].x = arrow_x + r * ::cosf(theta - CV_PI / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI / 2.0f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);

v[4].x = arrow_x + r * ::cosf(theta + CV_PI / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);

int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;

color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;

@ -42,7 +42,6 @@

#if !defined CUDA_DISABLER

#include <stdio.h>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
@ -57,8 +56,6 @@
#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100

using namespace std;

namespace cv { namespace gpu { namespace device { namespace optflow_farneback
{
__constant__ float c_g[8];

@ -47,10 +47,11 @@

#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
@ -75,9 +76,9 @@ namespace cv { namespace gpu { namespace device

__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem[8 * 32];

volatile int* srow = smem + threadIdx.y * blockDim.x;
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
__shared__ int smem2[8 * 32];

const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;

@ -109,9 +110,12 @@ namespace cv { namespace gpu { namespace device
c += Ix * Iy;
}

reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
int* srow2 = smem2 + threadIdx.y * blockDim.x;

plus<int> op;
reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));

if (threadIdx.x == 0)
{
@ -151,9 +155,13 @@ namespace cv { namespace gpu { namespace device

__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem[8 * 32];
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];

volatile int* srow = smem + threadIdx.y * blockDim.x;
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;

plus<int> op;

const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;

@ -167,7 +175,7 @@ namespace cv { namespace gpu { namespace device
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);

reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
reduce<32>(srow0, m_10, threadIdx.x, op);

for (int v = 1; v <= half_k; ++v)
{
@ -185,8 +193,7 @@ namespace cv { namespace gpu { namespace device
m_sum += u * (val_plus + val_minus);
}

reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));

m_10 += m_sum;
m_01 += v * v_sum;

File diff suppressed because it is too large
@ -69,7 +69,7 @@ namespace cv { namespace gpu { namespace device

template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device

template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, int)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)srcWhole;
(void)xoff;
@ -124,10 +124,10 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, int cc) \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
@ -142,7 +142,7 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, int) \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@ -194,20 +194,20 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc);
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
}
};

template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);

static const caller_t callers[3][5] =
{
@ -235,38 +235,38 @@ namespace cv { namespace gpu { namespace device
};

callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}

template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device

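The dispatcher now takes a `bool cc20` flag instead of a raw compute-capability number. How the caller derives that flag is outside this hunk; one plain CUDA-runtime way to obtain it (a sketch under that assumption, not necessarily what the host side of this patch does) would be:

```cpp
#include <cuda_runtime.h>

// Sketch: decide the cc20 flag from the current device's compute capability.
bool deviceHasCC20()
{
    int dev = 0;
    cudaGetDevice(&dev);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    return prop.major >= 2;   // Fermi (sm_20) or newer
}
```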
modules/gpu/src/cuda/row_filter.0.cu
Normal file
@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
template void linearRow<uchar, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53
modules/gpu/src/cuda/row_filter.1.cu
Normal file
53
modules/gpu/src/cuda/row_filter.1.cu
Normal file
@ -0,0 +1,53 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "row_filter.h"
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template void linearRow<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
modules/gpu/src/cuda/row_filter.10.cu
Normal file
@ -0,0 +1,53 @@

#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
template void linearRow<unsigned short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
modules/gpu/src/cuda/row_filter.11.cu
Normal file
@ -0,0 +1,53 @@

#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
template void linearRow<ushort3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.12.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<ushort4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.13.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<int3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.14.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<int4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.2.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.3.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.4.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<int, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.5.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.6.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.7.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.8.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
53  modules/gpu/src/cuda/row_filter.9.cu  Normal file
@@ -0,0 +1,53 @@
#if !defined CUDA_DISABLER

#include "row_filter.h"

namespace filter
{
    template void linearRow<short4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}

#endif /* CUDA_DISABLER */
@@ -1,390 +0,0 @@
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
#include "opencv2/gpu/device/static_check.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace row_filter
|
||||
{
|
||||
#define MAX_KERNEL_SIZE 32
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
|
||||
{
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, typename B>
|
||||
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
|
||||
{
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int BLOCK_DIM_X = 32;
|
||||
const int BLOCK_DIM_Y = 8;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = 1;
|
||||
#else
|
||||
const int BLOCK_DIM_X = 32;
|
||||
const int BLOCK_DIM_Y = 4;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = 1;
|
||||
#endif
|
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
|
||||
|
||||
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
|
||||
|
||||
if (y >= src.rows)
|
||||
return;
|
||||
|
||||
const T* src_row = src.ptr(y);
|
||||
|
||||
const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
|
||||
|
||||
if (blockIdx.x > 0)
|
||||
{
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
|
||||
}
|
||||
|
||||
if (blockIdx.x + 2 < gridDim.x)
|
||||
{
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
{
|
||||
const int x = xStart + j * BLOCK_DIM_X;
|
||||
|
||||
if (x < src.cols)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < KSIZE; ++k)
|
||||
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
|
||||
|
||||
dst(y, x) = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B>
|
||||
void linearRowFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
|
||||
{
|
||||
int BLOCK_DIM_X;
|
||||
int BLOCK_DIM_Y;
|
||||
int PATCH_PER_BLOCK;
|
||||
|
||||
if (cc >= 20)
|
||||
{
|
||||
BLOCK_DIM_X = 32;
|
||||
BLOCK_DIM_Y = 8;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCK_DIM_X = 32;
|
||||
BLOCK_DIM_Y = 4;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
|
||||
|
||||
B<T> brd(src.cols);
|
||||
|
||||
linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T, typename D>
|
||||
void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5][33] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller< 1, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 2, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 3, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 4, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 5, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 6, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 7, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 8, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller< 9, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<17, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<18, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<19, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<20, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<21, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<22, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<23, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<24, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<25, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<26, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<27, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<28, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<29, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<30, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<31, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<32, T, D, BrdRowWrap>
|
||||
}
|
||||
};
|
||||
|
||||
loadKernel(kernel, ksize, stream);
|
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
|
||||
}
|
||||
|
||||
template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<int , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
|
||||
} // namespace row_filter
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
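The 390-line row_filter.cu removed above carried, besides the kernel itself, a callers[5][33] table of function pointers: the first index is the border mode (BrdRowReflect101, BrdRowReplicate, BrdRowConstant, BrdRowReflect, BrdRowWrap) and the second is the kernel width, with slot 0 left null because a valid ksize starts at 1. The same table reappears in the new row_filter.h below, now built from row_filter::caller. The stand-alone sketch that follows shows the dispatch idea with made-up names and a reduced table size; it is illustrative only, not code from the patch.

    // Reduced sketch of the [border][ksize] function-pointer dispatch (illustrative).
    #include <cstdio>

    struct Reflect101 {};                      // stand-ins for the BrdRow* border policies
    struct Replicate  {};

    template <int KSIZE, class Border>
    void run_filter() { std::printf("ksize=%d\n", KSIZE); }  // stand-in for row_filter::caller

    typedef void (*caller_t)();

    static const caller_t callers[2][4] =      // the real table is [5][33]
    {
        { 0, run_filter<1, Reflect101>, run_filter<2, Reflect101>, run_filter<3, Reflect101> },
        { 0, run_filter<1, Replicate>,  run_filter<2, Replicate>,  run_filter<3, Replicate>  },
    };

    int main()
    {
        int brd_type = 1, ksize = 3;
        callers[brd_type][ksize]();            // mirrors callers[brd_type][ksize](src, dst, anchor, cc, stream)
        return 0;
    }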
372  modules/gpu/src/cuda/row_filter.h  Normal file
@@ -0,0 +1,372 @@
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace row_filter
|
||||
{
|
||||
#define MAX_KERNEL_SIZE 32
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
template <int KSIZE, typename T, typename D, typename B>
|
||||
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
|
||||
{
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int BLOCK_DIM_X = 32;
|
||||
const int BLOCK_DIM_Y = 8;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = 1;
|
||||
#else
|
||||
const int BLOCK_DIM_X = 32;
|
||||
const int BLOCK_DIM_Y = 4;
|
||||
const int PATCH_PER_BLOCK = 4;
|
||||
const int HALO_SIZE = 1;
|
||||
#endif
|
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
|
||||
|
||||
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
|
||||
|
||||
if (y >= src.rows)
|
||||
return;
|
||||
|
||||
const T* src_row = src.ptr(y);
|
||||
|
||||
const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
|
||||
|
||||
if (blockIdx.x > 0)
|
||||
{
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
|
||||
}
|
||||
|
||||
if (blockIdx.x + 2 < gridDim.x)
|
||||
{
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HALO_SIZE; ++j)
|
||||
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
|
||||
{
|
||||
const int x = xStart + j * BLOCK_DIM_X;
|
||||
|
||||
if (x < src.cols)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < KSIZE; ++k)
|
||||
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
|
||||
|
||||
dst(y, x) = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
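The kernel above stages one source row plus a BLOCK_DIM_X-wide halo on each side in shared memory, then each thread convolves PATCH_PER_BLOCK output pixels against the taps in c_kernel. A minimal host-side reference of the same row convolution with reflect-101 border handling may help when checking the anchor arithmetic; it is an illustrative sketch (float data, kernel no wider than the row), not code from this patch.

// Illustrative host reference for the row pass above (assumption: float data,
// BORDER_REFLECT_101 semantics, ksize <= cols). Not part of the patch.
#include <vector>

static int reflect101(int x, int len)
{
    if (x < 0)    return -x;                // -1 -> 1 (edge pixel not repeated)
    if (x >= len) return 2 * len - 2 - x;   // len -> len - 2
    return x;
}

void rowFilterRef(const std::vector<float>& src, std::vector<float>& dst,
                  int cols, const std::vector<float>& kernel, int anchor)
{
    dst.resize(cols);
    for (int x = 0; x < cols; ++x)
    {
        float sum = 0.f;
        for (int k = 0; k < (int)kernel.size(); ++k)
            sum += src[reflect101(x - anchor + k, cols)] * kernel[k];
        dst[x] = sum;
    }
}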
|
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B>
|
||||
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
|
||||
{
|
||||
int BLOCK_DIM_X;
|
||||
int BLOCK_DIM_Y;
|
||||
int PATCH_PER_BLOCK;
|
||||
|
||||
if (cc >= 20)
|
||||
{
|
||||
BLOCK_DIM_X = 32;
|
||||
BLOCK_DIM_Y = 8;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCK_DIM_X = 32;
|
||||
BLOCK_DIM_Y = 4;
|
||||
PATCH_PER_BLOCK = 4;
|
||||
}
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
|
||||
|
||||
B<T> brd(src.cols);
|
||||
|
||||
linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
namespace filter
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5][33] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 2, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 3, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 4, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 5, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 6, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 7, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 8, T, D, BrdRowReflect101>,
|
||||
row_filter::caller< 9, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<10, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<11, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<12, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<13, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<14, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<15, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<16, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<17, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<18, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<19, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<20, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<21, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<22, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<23, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<24, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<25, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<26, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<27, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<28, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<29, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<30, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<31, T, D, BrdRowReflect101>,
|
||||
row_filter::caller<32, T, D, BrdRowReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 2, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 3, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 4, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 5, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 6, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 7, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 8, T, D, BrdRowReplicate>,
|
||||
row_filter::caller< 9, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<10, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<11, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<12, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<13, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<14, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<15, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<16, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<17, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<18, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<19, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<20, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<21, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<22, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<23, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<24, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<25, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<26, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<27, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<28, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<29, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<30, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<31, T, D, BrdRowReplicate>,
|
||||
row_filter::caller<32, T, D, BrdRowReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 2, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 3, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 4, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 5, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 6, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 7, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 8, T, D, BrdRowConstant>,
|
||||
row_filter::caller< 9, T, D, BrdRowConstant>,
|
||||
row_filter::caller<10, T, D, BrdRowConstant>,
|
||||
row_filter::caller<11, T, D, BrdRowConstant>,
|
||||
row_filter::caller<12, T, D, BrdRowConstant>,
|
||||
row_filter::caller<13, T, D, BrdRowConstant>,
|
||||
row_filter::caller<14, T, D, BrdRowConstant>,
|
||||
row_filter::caller<15, T, D, BrdRowConstant>,
|
||||
row_filter::caller<16, T, D, BrdRowConstant>,
|
||||
row_filter::caller<17, T, D, BrdRowConstant>,
|
||||
row_filter::caller<18, T, D, BrdRowConstant>,
|
||||
row_filter::caller<19, T, D, BrdRowConstant>,
|
||||
row_filter::caller<20, T, D, BrdRowConstant>,
|
||||
row_filter::caller<21, T, D, BrdRowConstant>,
|
||||
row_filter::caller<22, T, D, BrdRowConstant>,
|
||||
row_filter::caller<23, T, D, BrdRowConstant>,
|
||||
row_filter::caller<24, T, D, BrdRowConstant>,
|
||||
row_filter::caller<25, T, D, BrdRowConstant>,
|
||||
row_filter::caller<26, T, D, BrdRowConstant>,
|
||||
row_filter::caller<27, T, D, BrdRowConstant>,
|
||||
row_filter::caller<28, T, D, BrdRowConstant>,
|
||||
row_filter::caller<29, T, D, BrdRowConstant>,
|
||||
row_filter::caller<30, T, D, BrdRowConstant>,
|
||||
row_filter::caller<31, T, D, BrdRowConstant>,
|
||||
row_filter::caller<32, T, D, BrdRowConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 2, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 3, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 4, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 5, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 6, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 7, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 8, T, D, BrdRowReflect>,
|
||||
row_filter::caller< 9, T, D, BrdRowReflect>,
|
||||
row_filter::caller<10, T, D, BrdRowReflect>,
|
||||
row_filter::caller<11, T, D, BrdRowReflect>,
|
||||
row_filter::caller<12, T, D, BrdRowReflect>,
|
||||
row_filter::caller<13, T, D, BrdRowReflect>,
|
||||
row_filter::caller<14, T, D, BrdRowReflect>,
|
||||
row_filter::caller<15, T, D, BrdRowReflect>,
|
||||
row_filter::caller<16, T, D, BrdRowReflect>,
|
||||
row_filter::caller<17, T, D, BrdRowReflect>,
|
||||
row_filter::caller<18, T, D, BrdRowReflect>,
|
||||
row_filter::caller<19, T, D, BrdRowReflect>,
|
||||
row_filter::caller<20, T, D, BrdRowReflect>,
|
||||
row_filter::caller<21, T, D, BrdRowReflect>,
|
||||
row_filter::caller<22, T, D, BrdRowReflect>,
|
||||
row_filter::caller<23, T, D, BrdRowReflect>,
|
||||
row_filter::caller<24, T, D, BrdRowReflect>,
|
||||
row_filter::caller<25, T, D, BrdRowReflect>,
|
||||
row_filter::caller<26, T, D, BrdRowReflect>,
|
||||
row_filter::caller<27, T, D, BrdRowReflect>,
|
||||
row_filter::caller<28, T, D, BrdRowReflect>,
|
||||
row_filter::caller<29, T, D, BrdRowReflect>,
|
||||
row_filter::caller<30, T, D, BrdRowReflect>,
|
||||
row_filter::caller<31, T, D, BrdRowReflect>,
|
||||
row_filter::caller<32, T, D, BrdRowReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
row_filter::caller< 1, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 2, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 3, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 4, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 5, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 6, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 7, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 8, T, D, BrdRowWrap>,
|
||||
row_filter::caller< 9, T, D, BrdRowWrap>,
|
||||
row_filter::caller<10, T, D, BrdRowWrap>,
|
||||
row_filter::caller<11, T, D, BrdRowWrap>,
|
||||
row_filter::caller<12, T, D, BrdRowWrap>,
|
||||
row_filter::caller<13, T, D, BrdRowWrap>,
|
||||
row_filter::caller<14, T, D, BrdRowWrap>,
|
||||
row_filter::caller<15, T, D, BrdRowWrap>,
|
||||
row_filter::caller<16, T, D, BrdRowWrap>,
|
||||
row_filter::caller<17, T, D, BrdRowWrap>,
|
||||
row_filter::caller<18, T, D, BrdRowWrap>,
|
||||
row_filter::caller<19, T, D, BrdRowWrap>,
|
||||
row_filter::caller<20, T, D, BrdRowWrap>,
|
||||
row_filter::caller<21, T, D, BrdRowWrap>,
|
||||
row_filter::caller<22, T, D, BrdRowWrap>,
|
||||
row_filter::caller<23, T, D, BrdRowWrap>,
|
||||
row_filter::caller<24, T, D, BrdRowWrap>,
|
||||
row_filter::caller<25, T, D, BrdRowWrap>,
|
||||
row_filter::caller<26, T, D, BrdRowWrap>,
|
||||
row_filter::caller<27, T, D, BrdRowWrap>,
|
||||
row_filter::caller<28, T, D, BrdRowWrap>,
|
||||
row_filter::caller<29, T, D, BrdRowWrap>,
|
||||
row_filter::caller<30, T, D, BrdRowWrap>,
|
||||
row_filter::caller<31, T, D, BrdRowWrap>,
|
||||
row_filter::caller<32, T, D, BrdRowWrap>
|
||||
}
|
||||
};
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
|
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
|
||||
}
|
||||
}
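linearRow copies the filter taps into the __constant__ array c_kernel (synchronously on the default stream, asynchronously otherwise) and then indexes the static callers table by border type and kernel size, so each supported KSIZE runs as a separately instantiated kernel whose inner loop is fully unrolled. A compact sketch of that runtime-to-compile-time dispatch pattern follows; the names (c_taps, convolve1d, launch, convolve) are hypothetical and the table is truncated for brevity.

// Sketch of dispatching a runtime kernel size to compile-time instantiations.
// Hypothetical names; assumes 'taps' is a device pointer, as in the patch.
#include <cuda_runtime.h>

__constant__ float c_taps[32];

template <int KSIZE>
__global__ void convolve1d(const float* src, float* dst, int n, int anchor)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x >= n)
        return;

    float sum = 0.f;
    #pragma unroll
    for (int k = 0; k < KSIZE; ++k)
    {
        const int idx = ::min(::max(x - anchor + k, 0), n - 1); // clamped border
        sum += src[idx] * c_taps[k];
    }
    dst[x] = sum;
}

typedef void (*launch_t)(const float*, float*, int, int, cudaStream_t);

template <int KSIZE>
void launch(const float* src, float* dst, int n, int anchor, cudaStream_t s)
{
    convolve1d<KSIZE><<<(n + 255) / 256, 256, 0, s>>>(src, dst, n, anchor);
}

void convolve(const float* src, float* dst, int n,
              const float* taps, int ksize, int anchor, cudaStream_t s)
{
    static const launch_t table[] = { 0, launch<1>, launch<2>, launch<3>, launch<4>, launch<5> };
    cudaMemcpyToSymbolAsync(c_taps, taps, ksize * sizeof(float), 0,
                            cudaMemcpyDeviceToDevice, s);
    table[ksize](src, dst, n, anchor, s);                // ksize in [1, 5] here
}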
|
@@ -454,7 +454,7 @@ namespace cv { namespace gpu { namespace device
grid.x = divUp(cols, threads.x << 1);
grid.y = divUp(rows, threads.y);

int elem_step = u.step/sizeof(T);
int elem_step = (int)(u.step / sizeof(T));

for(int t = 0; t < iters; ++t)
{
@@ -42,9 +42,11 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
@@ -297,28 +299,13 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
|
||||
extern __shared__ float smem[];
|
||||
float* dline = smem + winsz * threadIdx.z;
|
||||
|
||||
dline[tid] = val;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
|
||||
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }
|
||||
|
||||
volatile float* vdline = smem + winsz * threadIdx.z;
|
||||
|
||||
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
|
||||
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
|
||||
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
|
||||
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
|
||||
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
|
||||
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
|
||||
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
|
||||
|
||||
T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;
|
||||
|
||||
if (tid == 0)
|
||||
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
|
||||
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -496,26 +483,11 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
|
||||
extern __shared__ float smem[];
|
||||
float* dline = smem + winsz * threadIdx.z;
|
||||
|
||||
dline[tid] = val;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
|
||||
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }
|
||||
|
||||
volatile float* vdline = smem + winsz * threadIdx.z;
|
||||
|
||||
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
|
||||
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
|
||||
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
|
||||
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
|
||||
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
|
||||
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
|
||||
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
|
||||
|
||||
if (tid == 0)
|
||||
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
|
||||
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
|
||||
}
|
||||
}
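Both data-cost kernels above swap a hand-written shared-memory tree reduction (explicit __syncthreads steps for the upper levels plus a volatile warp-synchronous tail) for a single call to the generic reduce<winsz>(...) utility from opencv2/gpu/device/reduce.hpp. A self-contained sketch of the block-wide sum that such a call performs is given here for reference; it is illustrative, not the library implementation.

// Illustrative block-wide sum (what reduce<winsz>(smem, val, tid, plus<float>())
// accomplishes before thread 0 writes the result). Not the OpenCV primitive.
template <int BLOCK>
__global__ void blockSum(const float* in, float* out)
{
    __shared__ float smem[BLOCK];
    const int tid = threadIdx.x;

    float val = in[blockIdx.x * BLOCK + tid];
    smem[tid] = val;
    __syncthreads();

    for (int s = BLOCK / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            smem[tid] += smem[tid + s];
        __syncthreads();
    }

    if (tid == 0)
        out[blockIdx.x] = smem[0]; // same role as data_cost[...] = saturate_cast<T>(val)
}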
|
||||
|
||||
|
@@ -47,13 +47,13 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/filters.hpp"
#include <float.h>

namespace cv { namespace gpu { namespace device
{
@@ -568,7 +568,9 @@ namespace cv { namespace gpu { namespace device

float bestx = 0, besty = 0, best_mod = 0;

#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
@@ -599,8 +601,9 @@ namespace cv { namespace gpu { namespace device
|
||||
sumy += s_Y[threadIdx.x + 96];
|
||||
}
|
||||
|
||||
device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
|
||||
device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
|
||||
plus<float> op;
|
||||
device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
|
||||
thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
|
||||
|
||||
const float temp_mod = sumx * sumx + sumy * sumy;
|
||||
if (temp_mod > best_mod)
|
||||
@@ -638,7 +641,7 @@ namespace cv { namespace gpu { namespace device
|
||||
kp_dir *= 180.0f / CV_PI_F;
|
||||
|
||||
kp_dir = 360.0f - kp_dir;
|
||||
if (abs(kp_dir - 360.f) < FLT_EPSILON)
|
||||
if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
|
||||
kp_dir = 0.f;
|
||||
|
||||
featureDir[blockIdx.x] = kp_dir;
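In the orientation hunk further up, the two separate 32-lane reductions of sumx and sumy are fused into one tuple-based device::reduce call over both accumulators. A generic sketch of reducing two values in a single shared-memory pass is shown below; it assumes one 32-thread warp executing in lock-step (true for the pre-Volta devices this code targets) and is not the smem_tuple machinery the patch actually uses.

// Illustrative two-accumulator warp reduction (assumption: 32 lanes, lock-step
// execution). The patch uses smem_tuple/thrust::tie from reduce.hpp instead.
__device__ void warpSum2(volatile float* sx, volatile float* sy,
                         float& sumx, float& sumy, int lane)
{
    sx[lane] = sumx;
    sy[lane] = sumy;

    for (int offset = 16; offset > 0; offset >>= 1)
    {
        if (lane < offset)
        {
            sx[lane] += sx[lane + offset];
            sy[lane] += sy[lane + offset];
        }
    }

    sumx = sx[0];
    sumy = sy[0];
}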
|
||||
@@ -697,11 +700,6 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef uchar elem_type;
|
||||
|
||||
__device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
|
||||
centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
|
||||
{
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uchar operator ()(int i, int j) const
|
||||
{
|
||||
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
|
||||
@@ -715,285 +713,215 @@ namespace cv { namespace gpu { namespace device
|
||||
float win_offset;
|
||||
float cos_dir;
|
||||
float sin_dir;
|
||||
int width;
|
||||
int height;
|
||||
};
|
||||
|
||||
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
|
||||
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
|
||||
float& dx, float& dy)
|
||||
{
|
||||
__shared__ float s_PATCH[6][6];
|
||||
__shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
|
||||
|
||||
const float centerX = featureX[blockIdx.x];
|
||||
const float centerY = featureY[blockIdx.x];
|
||||
const float size = featureSize[blockIdx.x];
|
||||
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
|
||||
if (std::abs(descriptor_dir - 360.f) < FLT_EPSILON)
|
||||
descriptor_dir = 0.f;
|
||||
descriptor_dir *= (float)(CV_PI_F / 180.0f);
|
||||
dx = dy = 0.0f;
|
||||
|
||||
/* The sampling intervals and wavelet sized for selecting an orientation
|
||||
and building the keypoint descriptor are defined relative to 's' */
|
||||
const float s = size * 1.2f / 9.0f;
|
||||
WinReader win;
|
||||
|
||||
/* Extract a window of pixels around the keypoint of size 20s */
|
||||
win.centerX = featureX[blockIdx.x];
|
||||
win.centerY = featureY[blockIdx.x];
|
||||
|
||||
// The sampling intervals and wavelet sized for selecting an orientation
|
||||
// and building the keypoint descriptor are defined relative to 's'
|
||||
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
|
||||
|
||||
// Extract a window of pixels around the keypoint of size 20s
|
||||
const int win_size = (int)((PATCH_SZ + 1) * s);
|
||||
|
||||
float sin_dir;
|
||||
float cos_dir;
|
||||
sincosf(descriptor_dir, &sin_dir, &cos_dir);
|
||||
win.width = win.height = win_size;
|
||||
|
||||
/* Nearest neighbour version (faster) */
|
||||
const float win_offset = -(float)(win_size - 1) / 2;
|
||||
|
||||
// Compute sampling points
|
||||
// since grids are 2D, need to compute xBlock and yBlock indices
|
||||
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
|
||||
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
|
||||
const int xIndex = xBlock * 5 + threadIdx.x;
|
||||
const int yIndex = yBlock * 5 + threadIdx.y;
|
||||
|
||||
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
|
||||
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
|
||||
|
||||
LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));
|
||||
|
||||
s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 5 && threadIdx.y < 5)
|
||||
{
|
||||
const int tid = threadIdx.y * 5 + threadIdx.x;
|
||||
|
||||
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
|
||||
|
||||
const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;
|
||||
const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;
|
||||
|
||||
s_dx_bin[tid] = vx;
|
||||
s_dy_bin[tid] = vy;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)
|
||||
{
|
||||
// first step is to reduce from 25 to 16
|
||||
if (tid < 9) // use 9 threads
|
||||
{
|
||||
sdata1[tid] += sdata1[tid + 16];
|
||||
sdata2[tid] += sdata2[tid + 16];
|
||||
sdata3[tid] += sdata3[tid + 16];
|
||||
sdata4[tid] += sdata4[tid + 16];
|
||||
}
|
||||
|
||||
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
|
||||
if (tid < 8)
|
||||
{
|
||||
sdata1[tid] += sdata1[tid + 8];
|
||||
sdata1[tid] += sdata1[tid + 4];
|
||||
sdata1[tid] += sdata1[tid + 2];
|
||||
sdata1[tid] += sdata1[tid + 1];
|
||||
|
||||
sdata2[tid] += sdata2[tid + 8];
|
||||
sdata2[tid] += sdata2[tid + 4];
|
||||
sdata2[tid] += sdata2[tid + 2];
|
||||
sdata2[tid] += sdata2[tid + 1];
|
||||
|
||||
sdata3[tid] += sdata3[tid + 8];
|
||||
sdata3[tid] += sdata3[tid + 4];
|
||||
sdata3[tid] += sdata3[tid + 2];
|
||||
sdata3[tid] += sdata3[tid + 1];
|
||||
|
||||
sdata4[tid] += sdata4[tid + 8];
|
||||
sdata4[tid] += sdata4[tid + 4];
|
||||
sdata4[tid] += sdata4[tid + 2];
|
||||
sdata4[tid] += sdata4[tid + 1];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
{
|
||||
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
|
||||
__shared__ float sdx[25];
|
||||
__shared__ float sdy[25];
|
||||
__shared__ float sdxabs[25];
|
||||
__shared__ float sdyabs[25];
|
||||
|
||||
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
|
||||
__syncthreads();
|
||||
// Nearest neighbour version (faster)
|
||||
win.win_offset = -(win_size - 1.0f) / 2.0f;
|
||||
|
||||
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
|
||||
if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
|
||||
descriptor_dir = 0.f;
|
||||
descriptor_dir *= CV_PI_F / 180.0f;
|
||||
sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);
|
||||
|
||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
if (tid < 25)
|
||||
const int xLoadInd = tid % (PATCH_SZ + 1);
|
||||
const int yLoadInd = tid / (PATCH_SZ + 1);
|
||||
|
||||
if (yLoadInd < (PATCH_SZ + 1))
|
||||
{
|
||||
sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array
|
||||
sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array
|
||||
__syncthreads();
|
||||
|
||||
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
|
||||
__syncthreads();
|
||||
|
||||
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
|
||||
|
||||
// write dx, dy, |dx|, |dy|
|
||||
if (tid == 0)
|
||||
if (s > 1)
|
||||
{
|
||||
descriptors_block[0] = sdx[0];
|
||||
descriptors_block[1] = sdy[0];
|
||||
descriptors_block[2] = sdxabs[0];
|
||||
descriptors_block[3] = sdyabs[0];
|
||||
AreaFilter<WinReader> filter(win, s, s);
|
||||
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
|
||||
}
|
||||
else
|
||||
{
|
||||
LinearFilter<WinReader> filter(win);
|
||||
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int xPatchInd = threadIdx.x % 5;
|
||||
const int yPatchInd = threadIdx.x / 5;
|
||||
|
||||
if (yPatchInd < 5)
|
||||
{
|
||||
const int xBlockInd = threadIdx.y % 4;
|
||||
const int yBlockInd = threadIdx.y / 4;
|
||||
|
||||
const int xInd = xBlockInd * 5 + xPatchInd;
|
||||
const int yInd = yBlockInd * 5 + yPatchInd;
|
||||
|
||||
const float dw = c_DW[yInd * PATCH_SZ + xInd];
|
||||
|
||||
dx = (s_PATCH[yInd ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd ]) * dw;
|
||||
dy = (s_PATCH[yInd + 1][xInd ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd ][xInd + 1]) * dw;
|
||||
}
|
||||
}
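The rewritten calc_dx_dy loads the whole (PATCH_SZ + 1) x (PATCH_SZ + 1) sample window into shared memory (an area filter when the sampling step s exceeds one pixel, bilinear interpolation otherwise) and hands back per-sample responses instead of filling per-block shared arrays. The dx/dy expressions above are 2x2 Haar responses over four neighbouring patch samples weighted by the Gaussian window c_DW; spelled out as a small helper for clarity (illustrative, not part of the patch).

// 2x2 Haar responses used above: p00 = P[y][x], p01 = P[y][x+1],
// p10 = P[y+1][x], p11 = P[y+1][x+1], dw = Gaussian weight. Illustrative only.
__device__ __forceinline__ void haar2x2(float p00, float p01, float p10, float p11,
                                        float dw, float& dx, float& dy)
{
    dx = (p01 - p00 + p11 - p10) * dw; // horizontal step response
    dy = (p10 - p00 + p11 - p01) * dw; // vertical step response
}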
|
||||
|
||||
__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
__global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
{
|
||||
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
|
||||
__shared__ float sdx[25];
|
||||
__shared__ float sdy[25];
|
||||
__shared__ float smem[32 * 16];
|
||||
|
||||
// sum (reduce) 5x5 area response
|
||||
__shared__ float sd1[25];
|
||||
__shared__ float sd2[25];
|
||||
__shared__ float sdabs1[25];
|
||||
__shared__ float sdabs2[25];
|
||||
float* sRow = smem + threadIdx.y * 32;
|
||||
|
||||
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
|
||||
__syncthreads();
|
||||
float dx, dy;
|
||||
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
|
||||
|
||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
float dxabs = ::fabsf(dx);
|
||||
float dyabs = ::fabsf(dy);
|
||||
|
||||
if (tid < 25)
|
||||
plus<float> op;
|
||||
|
||||
reduce<32>(sRow, dx, threadIdx.x, op);
|
||||
reduce<32>(sRow, dy, threadIdx.x, op);
|
||||
reduce<32>(sRow, dxabs, threadIdx.x, op);
|
||||
reduce<32>(sRow, dyabs, threadIdx.x, op);
|
||||
|
||||
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;
|
||||
|
||||
// write dx, dy, |dx|, |dy|
|
||||
if (threadIdx.x == 0)
|
||||
*descriptors_block = make_float4(dx, dy, dxabs, dyabs);
|
||||
}
|
||||
|
||||
__global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
|
||||
{
|
||||
__shared__ float smem[32 * 16];
|
||||
|
||||
float* sRow = smem + threadIdx.y * 32;
|
||||
|
||||
float dx, dy;
|
||||
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
|
||||
|
||||
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
|
||||
|
||||
plus<float> op;
|
||||
|
||||
float d1 = 0.0f;
|
||||
float d2 = 0.0f;
|
||||
float abs1 = 0.0f;
|
||||
float abs2 = 0.0f;
|
||||
|
||||
if (dy >= 0)
|
||||
{
|
||||
if (sdy[tid] >= 0)
|
||||
{
|
||||
sd1[tid] = sdx[tid];
|
||||
sdabs1[tid] = ::fabs(sdx[tid]);
|
||||
sd2[tid] = 0;
|
||||
sdabs2[tid] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
sd1[tid] = 0;
|
||||
sdabs1[tid] = 0;
|
||||
sd2[tid] = sdx[tid];
|
||||
sdabs2[tid] = ::fabs(sdx[tid]);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
|
||||
__syncthreads();
|
||||
|
||||
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
|
||||
|
||||
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
|
||||
if (tid == 0)
|
||||
{
|
||||
descriptors_block[0] = sd1[0];
|
||||
descriptors_block[1] = sdabs1[0];
|
||||
descriptors_block[2] = sd2[0];
|
||||
descriptors_block[3] = sdabs2[0];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (sdx[tid] >= 0)
|
||||
{
|
||||
sd1[tid] = sdy[tid];
|
||||
sdabs1[tid] = ::fabs(sdy[tid]);
|
||||
sd2[tid] = 0;
|
||||
sdabs2[tid] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
sd1[tid] = 0;
|
||||
sdabs1[tid] = 0;
|
||||
sd2[tid] = sdy[tid];
|
||||
sdabs2[tid] = ::fabs(sdy[tid]);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
|
||||
__syncthreads();
|
||||
|
||||
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
|
||||
if (tid == 0)
|
||||
{
|
||||
descriptors_block[4] = sd1[0];
|
||||
descriptors_block[5] = sdabs1[0];
|
||||
descriptors_block[6] = sd2[0];
|
||||
descriptors_block[7] = sdabs2[0];
|
||||
}
|
||||
d1 = dx;
|
||||
abs1 = ::fabsf(dx);
|
||||
}
|
||||
else
|
||||
{
|
||||
d2 = dx;
|
||||
abs2 = ::fabsf(dx);
|
||||
}
|
||||
|
||||
reduce<32>(sRow, d1, threadIdx.x, op);
|
||||
reduce<32>(sRow, d2, threadIdx.x, op);
|
||||
reduce<32>(sRow, abs1, threadIdx.x, op);
|
||||
reduce<32>(sRow, abs2, threadIdx.x, op);
|
||||
|
||||
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
|
||||
if (threadIdx.x == 0)
|
||||
descriptors_block[0] = make_float4(d1, abs1, d2, abs2);
|
||||
|
||||
if (dx >= 0)
|
||||
{
|
||||
d1 = dy;
|
||||
abs1 = ::fabsf(dy);
|
||||
d2 = 0.0f;
|
||||
abs2 = 0.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
d1 = 0.0f;
|
||||
abs1 = 0.0f;
|
||||
d2 = dy;
|
||||
abs2 = ::fabsf(dy);
|
||||
}
|
||||
|
||||
reduce<32>(sRow, d1, threadIdx.x, op);
|
||||
reduce<32>(sRow, d2, threadIdx.x, op);
|
||||
reduce<32>(sRow, abs1, threadIdx.x, op);
|
||||
reduce<32>(sRow, abs2, threadIdx.x, op);
|
||||
|
||||
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
|
||||
if (threadIdx.x == 0)
|
||||
descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
|
||||
}
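compute_descriptors_64 stores one float4 per 5x5 sub-region, (sum dx, sum dy, sum |dx|, sum |dy|), for 16 x 4 = 64 values; compute_descriptors_128 splits each sum by the sign of the other derivative, for 16 x 8 = 128 values. A host-side sketch of the two accumulation rules (illustrative only, not patch code):

// Per-sample accumulation rules behind the 64- and 128-dimensional layouts.
#include <cmath>

void accumulate64(float dx, float dy, float* bin4)   // one 4-float bin per sub-region
{
    bin4[0] += dx;
    bin4[1] += dy;
    bin4[2] += std::fabs(dx);
    bin4[3] += std::fabs(dy);
}

void accumulate128(float dx, float dy, float* bin8)  // one 8-float bin per sub-region
{
    if (dy >= 0) { bin8[0] += dx; bin8[1] += std::fabs(dx); }
    else         { bin8[2] += dx; bin8[3] += std::fabs(dx); }

    if (dx >= 0) { bin8[4] += dy; bin8[5] += std::fabs(dy); }
    else         { bin8[6] += dy; bin8[7] += std::fabs(dy); }
}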
|
||||
|
||||
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
|
||||
{
|
||||
__shared__ float smem[BLOCK_DIM_X];
|
||||
__shared__ float s_len;
|
||||
|
||||
// no need for thread ID
|
||||
float* descriptor_base = descriptors.ptr(blockIdx.x);
|
||||
|
||||
// read in the unnormalized descriptor values (squared)
|
||||
__shared__ float sqDesc[BLOCK_DIM_X];
|
||||
const float lookup = descriptor_base[threadIdx.x];
|
||||
sqDesc[threadIdx.x] = lookup * lookup;
|
||||
__syncthreads();
|
||||
const float val = descriptor_base[threadIdx.x];
|
||||
|
||||
if (BLOCK_DIM_X >= 128)
|
||||
{
|
||||
if (threadIdx.x < 64)
|
||||
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
|
||||
__syncthreads();
|
||||
}
|
||||
float len = val * val;
|
||||
reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());
|
||||
|
||||
// reduction to get total
|
||||
if (threadIdx.x < 32)
|
||||
{
|
||||
volatile float* smem = sqDesc;
|
||||
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 32];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 16];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 8];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 4];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 2];
|
||||
smem[threadIdx.x] += smem[threadIdx.x + 1];
|
||||
}
|
||||
|
||||
// compute length (square root)
|
||||
__shared__ float len;
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
len = sqrtf(sqDesc[0]);
|
||||
}
|
||||
s_len = ::sqrtf(len);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// normalize and store in output
|
||||
descriptor_base[threadIdx.x] = lookup / len;
|
||||
descriptor_base[threadIdx.x] = val / s_len;
|
||||
}
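normalize_descriptors squares each component, block-reduces the sum of squares, takes the square root once, and divides every component by that length. The same step on the host, for reference (illustrative; neither the kernel nor this sketch guards against a zero-length descriptor):

// Host reference for the L2 normalization performed by normalize_descriptors.
#include <cmath>

void normalizeDescriptor(float* d, int len)
{
    float sum = 0.f;
    for (int i = 0; i < len; ++i)
        sum += d[i] * d[i];

    const float inv = 1.f / std::sqrt(sum);
    for (int i = 0; i < len; ++i)
        d[i] *= inv;
}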
|
||||
|
||||
void compute_descriptors_gpu(const PtrStepSzf& descriptors,
|
||||
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
|
||||
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
|
||||
{
|
||||
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
|
||||
|
||||
if (descriptors.cols == 64)
|
||||
{
|
||||
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
|
||||
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
|
||||
normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
else
|
||||
{
|
||||
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
|
||||
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
|
||||
normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
@@ -85,7 +85,7 @@ namespace cv
|
||||
|
||||
namespace device
|
||||
{
|
||||
using pcl::gpu::TextureBinder;
|
||||
using cv::gpu::TextureBinder;
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
|
||||
{
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
|
||||
|
||||
@@ -158,7 +158,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
|
||||
{
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, int)
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
|
||||
{
|
||||
(void)xoff;
|
||||
(void)yoff;
|
||||
@@ -195,10 +195,10 @@ namespace cv { namespace gpu { namespace device
|
||||
}; \
|
||||
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
|
||||
{ \
|
||||
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, int cc) \
|
||||
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
|
||||
{ \
|
||||
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
|
||||
dim3 block(32, cc >= 20 ? 8 : 4); \
|
||||
dim3 block(32, cc20 ? 8 : 4); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_warp_ ## type , srcWhole); \
|
||||
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
|
||||
@@ -212,7 +212,7 @@ namespace cv { namespace gpu { namespace device
|
||||
}; \
|
||||
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
|
||||
{ \
|
||||
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, int) \
|
||||
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
|
||||
{ \
|
||||
dim3 block(32, 8); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
@@ -263,20 +263,20 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
|
||||
{
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
|
||||
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
|
||||
{
|
||||
if (stream == 0)
|
||||
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
|
||||
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
|
||||
else
|
||||
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc);
|
||||
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Transform, typename T>
|
||||
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
|
||||
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
static const func_t funcs[3][5] =
|
||||
{
|
||||
@@ -304,84 +304,84 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
|
||||
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
|
||||
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
|
||||
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
|
||||
}
|
||||
|
||||
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
|
||||
|
||||
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
|
||||
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
|
||||
}
|
||||
|
||||
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
|
||||
|
||||
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
|
||||
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
|
||||
}
|
||||
|
||||
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
|
||||
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

} // namespace imgproc

}}} // namespace cv { namespace gpu { namespace device
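One set of instantiations above carries the raw compute capability as a trailing int cc argument, the other a bool cc20 flag (device has compute capability 2.0 or higher). A minimal host-side sketch of deriving such a flag with the 2.4-era cv::gpu API; the DeviceInfo-based helper and the commented forwarding call are illustrative, not part of this commit:

    #include <opencv2/gpu/gpu.hpp>

    // Sketch: derive the cc20 flag once on the host and forward it to the
    // launcher; the remaining arguments (offsets, coeffs, border handling)
    // depend on the surrounding wrapper and are only shown in the comment.
    static bool hasCompute20()
    {
        cv::gpu::DeviceInfo info;                               // current CUDA device
        return info.supports(cv::gpu::FEATURE_SET_COMPUTE_20);  // true on Fermi and newer
    }

    // ... inside the host wrapper:
    // warpPerspective_gpu<float>(src, srcWhole, xoff, yoff, coeffs, dst,
    //                            interpolation, borderMode, borderValue,
    //                            stream, hasCompute20());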
@ -1,7 +1,7 @@
#include "cuvid_video_source.h"
#include "cu_safe_call.h"

#if defined(HAVE_CUDA) && !defined(__APPLE__)
#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)

cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const std::string& fname)
{
@ -45,7 +45,7 @@

#include "precomp.hpp"

#if defined(HAVE_CUDA) && !defined(__APPLE__)
#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)

namespace cv { namespace gpu
{
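In both hunks the guard is keyed on HAVE_NVCUVID, which the build defines when NVCUVID support is enabled, rather than on the platform alone. A sketch of the resulting pattern; the stub branch is illustrative and not taken from this commit:

    #if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)
        // real implementation backed by the NVIDIA Video Decoding (NVCUVID) library
    #else
        // illustrative stub: without NVCUVID the entry points would typically
        // just report the missing feature, e.g.
        // CV_Error(CV_GpuNotSupported, "The library was built without NVCUVID support");
    #endif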
@ -45,15 +45,19 @@

namespace cv { namespace gpu { namespace device
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)

#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)

#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
@ -152,46 +156,119 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)

#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL
}}}

#endif
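Each declaration above is produced by the macros at the top of this header. With the by-value OPENCV_GPU_DECLARE_CVTCOLOR_ONE signature, a single line such as OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) expands to three launcher declarations (the _8U32F variants drop the 16-bit overload, and _8U32F_FULL additionally declares the _full_* overloads whose definition continues past the shown context):

    void xyz_to_bgra_8u (PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    void xyz_to_bgra_16u(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    void xyz_to_bgra_32f(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);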
@ -176,28 +176,11 @@ void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat

void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
{
#if (CUDA_VERSION < 5000)
(void)src;
(void)dst;
(void)h_luminance;
(void)h_color;
(void)search_window;
(void)block_window;
(void)s;

CV_Error( CV_GpuApiCallError, "Lab method required CUDA 5.0 and higher" );
#else

CV_Assert(src.type() == CV_8UC3);

lab.create(src.size(), src.type());
cv::gpu::cvtColor(src, lab, CV_BGR2Lab, 0, s);

/*Mat t;
cv::cvtColor(Mat(src), t, CV_BGR2Lab);
lab.upload(t);*/

l.create(src.size(), CV_8U);
ab.create(src.size(), CV_8UC2);
device::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
@ -207,11 +190,6 @@ void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat&

device::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s));
cv::gpu::cvtColor(lab, dst, CV_Lab2BGR, 0, s);

/*cv::cvtColor(Mat(lab), t, CV_Lab2BGR);
dst.upload(t);*/

#endif
}
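labMethod converts the CV_8UC3 input to Lab, splits the L and ab planes, denoises them with the luminance and color filter strengths, merges the result, and converts back to BGR. A minimal usage sketch, assuming a CUDA-capable build of the gpu module; the file names and filter strengths are illustrative:

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat host = cv::imread("noisy.png");    // must load as CV_8UC3 for labMethod
        cv::gpu::GpuMat src(host), dst;            // upload to the current CUDA device

        cv::gpu::FastNonLocalMeansDenoising denoiser;
        // h_luminance / h_color control filter strength; 21 and 7 are typical
        // search-window and block-window sizes for non-local means.
        denoiser.labMethod(src, dst, 20.0f, 10.0f, 21, 7, cv::gpu::Stream::Null());

        cv::imwrite("denoised.png", cv::Mat(dst)); // download and save the result
        return 0;
    }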
Some files were not shown because too many files have changed in this diff.