optimizations:
- new reduce implementation (with Kepler optimizations)
- saturate_cast via an asm instruction
- video SIMD instructions in element operations
- float arithmetic instead of double
- new deviceSupports function
commit 281d036fcf (parent ae6266e101)
@@ -79,6 +79,8 @@ namespace cv { namespace gpu
         WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
     };

+    CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
+
     // Gives information about what GPU archs this OpenCV GPU module was
     // compiled for
     class CV_EXPORTS TargetArchs
@@ -44,6 +44,7 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/type_traits.hpp"

 namespace cv { namespace gpu { namespace device
 {
@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
     void writeScalar(const int*);
     void writeScalar(const float*);
     void writeScalar(const double*);
     void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
     void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
 }}}
@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
     //////////////////////////////// ConvertTo ////////////////////////////////
     ///////////////////////////////////////////////////////////////////////////

-    template <typename T, typename D> struct Convertor : unary_function<T, D>
+    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
     {
-        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
+        Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}

-        __device__ __forceinline__ D operator()(const T& src) const
+        __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
        {
             return saturate_cast<D>(alpha * src + beta);
        }

-        double alpha, beta;
+        S alpha, beta;
     };

     namespace detail
@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
     };
     }

-    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+    template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
     {
     };

-    template<typename T, typename D>
+    template<typename T, typename D, typename S>
     void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
     {
-        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
-        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-        Convertor<T, D> op(alpha, beta);
+        Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
         cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
     }
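Passing the scale as a template parameter S is what turns "float arithmetic instead of double" into an actual code-generation change: for 8/16/32-bit images the functor is instantiated with S = float, so the device code never touches double-precision units (which run at a small fraction of float throughput on most consumer GPUs of this era). A stand-alone sketch of the same idea, outside the OpenCV helpers (Scale and convertKernel are illustrative names, and plain static_cast stands in for saturate_cast):

    #include <cuda_runtime.h>

    // Hypothetical stand-alone analogue of Convertor<T, D, S>: the scale type S
    // is a template parameter, so instantiating with S = float keeps the whole
    // multiply-add in single precision on the device.
    template <typename T, typename D, typename S>
    struct Scale
    {
        S alpha, beta;

        __device__ D operator()(T src) const
        {
            // With S = double this line would issue double-precision FMAs,
            // which are several times slower than float on consumer parts.
            return static_cast<D>(alpha * src + beta);
        }
    };

    template <typename T, typename D, typename S>
    __global__ void convertKernel(const T* src, D* dst, int n, Scale<T, D, S> op)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = op(src[i]);
    }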
@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
     {
         typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);

-        static const caller_t tab[8][8] =
+        static const caller_t tab[7][7] =
         {
-            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
-            cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
-
-            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
-            cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
-
-            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
-            cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
-
-            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
-            cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
-
-            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
-            cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
-
-            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
-            cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
-
-            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
-            cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
-
-            {0,0,0,0,0,0,0,0}
+            {
+                cvt_<uchar, uchar, float>,
+                cvt_<uchar, schar, float>,
+                cvt_<uchar, ushort, float>,
+                cvt_<uchar, short, float>,
+                cvt_<uchar, int, float>,
+                cvt_<uchar, float, float>,
+                cvt_<uchar, double, double>
+            },
+            {
+                cvt_<schar, uchar, float>,
+                cvt_<schar, schar, float>,
+                cvt_<schar, ushort, float>,
+                cvt_<schar, short, float>,
+                cvt_<schar, int, float>,
+                cvt_<schar, float, float>,
+                cvt_<schar, double, double>
+            },
+            {
+                cvt_<ushort, uchar, float>,
+                cvt_<ushort, schar, float>,
+                cvt_<ushort, ushort, float>,
+                cvt_<ushort, short, float>,
+                cvt_<ushort, int, float>,
+                cvt_<ushort, float, float>,
+                cvt_<ushort, double, double>
+            },
+            {
+                cvt_<short, uchar, float>,
+                cvt_<short, schar, float>,
+                cvt_<short, ushort, float>,
+                cvt_<short, short, float>,
+                cvt_<short, int, float>,
+                cvt_<short, float, float>,
+                cvt_<short, double, double>
+            },
+            {
+                cvt_<int, uchar, float>,
+                cvt_<int, schar, float>,
+                cvt_<int, ushort, float>,
+                cvt_<int, short, float>,
+                cvt_<int, int, double>,
+                cvt_<int, float, double>,
+                cvt_<int, double, double>
+            },
+            {
+                cvt_<float, uchar, float>,
+                cvt_<float, schar, float>,
+                cvt_<float, ushort, float>,
+                cvt_<float, short, float>,
+                cvt_<float, int, float>,
+                cvt_<float, float, float>,
+                cvt_<float, double, double>
+            },
+            {
+                cvt_<double, uchar, double>,
+                cvt_<double, schar, double>,
+                cvt_<double, ushort, double>,
+                cvt_<double, short, double>,
+                cvt_<double, int, double>,
+                cvt_<double, float, double>,
+                cvt_<double, double, double>
+            }
         };

         caller_t func = tab[sdepth][ddepth];
         if (!func)
             cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");

         func(src, dst, alpha, beta, stream);
     }
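The scale-type column follows a precision rule: float suffices while every source value fits in float's 24-bit significand, so the int row switches to double for int->int and int->float (a 32-bit int cannot round-trip through float), and every conversion touching double stays double. Rows and columns are indexed by the OpenCV depth codes CV_8U = 0 through CV_64F = 6, which is why dropping the unused CV_USRTYPE1 = 7 slot shrinks the table from 8x8 to 7x7. A small host-side demonstration of the int-through-float hazard (plain C++, not part of the diff):

    #include <cstdio>

    int main()
    {
        int big = (1 << 24) + 1;                            // 16777217: first int float cannot represent
        float  viaFloat  = static_cast<float>(big) * 1.0f;  // rounds to 16777216
        double viaDouble = static_cast<double>(big) * 1.0;  // exact
        printf("%d -> float: %.0f, double: %.0f\n", big, viaFloat, viaDouble);
        return 0;
    }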
@@ -45,8 +45,7 @@
 #include <iostream>

 #ifdef HAVE_CUDA
-    #include <cuda.h>
-    #include <cuda_runtime_api.h>
+    #include <cuda_runtime.h>
     #include <npp.h>

     #define CUDART_MINIMUM_REQUIRED_VERSION 4010
@@ -69,33 +68,89 @@ using namespace cv::gpu;

 namespace
 {
-    // Compares value to set using the given comparator. Returns true if
-    // there is at least one element x in the set satisfying to: x cmp value
-    // predicate.
-    template <typename Comparer>
-    bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
+    class CudaArch
     {
+    public:
+        CudaArch();
+
+        bool builtWith(FeatureSet feature_set) const;
+        bool hasPtx(int major, int minor) const;
+        bool hasBin(int major, int minor) const;
+        bool hasEqualOrLessPtx(int major, int minor) const;
+        bool hasEqualOrGreaterPtx(int major, int minor) const;
+        bool hasEqualOrGreaterBin(int major, int minor) const;
+
+    private:
+        static void fromStr(const string& set_as_str, vector<int>& arr);
+
+        vector<int> bin;
+        vector<int> ptx;
+        vector<int> features;
+    };
+
+    const CudaArch cudaArch;
+
+    CudaArch::CudaArch()
+    {
+    #ifdef HAVE_CUDA
+        fromStr(CUDA_ARCH_BIN, bin);
+        fromStr(CUDA_ARCH_PTX, ptx);
+        fromStr(CUDA_ARCH_FEATURES, features);
+    #endif
+    }
+
+    bool CudaArch::builtWith(FeatureSet feature_set) const
+    {
+        return !features.empty() && (features.back() >= feature_set);
+    }
+
+    bool CudaArch::hasPtx(int major, int minor) const
+    {
+        return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
+    }
+
+    bool CudaArch::hasBin(int major, int minor) const
+    {
+        return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
+    }
+
+    bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.front() <= major * 10 + minor);
+    }
+
+    bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
+    {
+        return !ptx.empty() && (ptx.back() >= major * 10 + minor);
+    }
+
+    bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
+    {
+        return !bin.empty() && (bin.back() >= major * 10 + minor);
+    }
+
+    void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
+    {
         if (set_as_str.find_first_not_of(" ") == string::npos)
-            return false;
+            return;

-        std::stringstream stream(set_as_str);
+        istringstream stream(set_as_str);
         int cur_value;

         while (!stream.eof())
         {
             stream >> cur_value;
-            if (cmp(cur_value, value))
-                return true;
+            arr.push_back(cur_value);
         }

-        return false;
+        sort(arr.begin(), arr.end());
     }
 }

 bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>());
+    return cudaArch.builtWith(feature_set);
 #else
     (void)feature_set;
     return false;
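CUDA_ARCH_BIN, CUDA_ARCH_PTX and CUDA_ARCH_FEATURES are strings baked in by the build system, each a space-separated list of compute capabilities encoded as major * 10 + minor (the comparisons against major * 10 + minor above imply this encoding). Parsing them once into sorted vectors is what lets front()/back() bound the supported range. A stand-alone sketch of the parse (the sample string is hypothetical; actual contents depend on the build configuration):

    #include <algorithm>
    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <vector>

    // Mirrors CudaArch::fromStr: parse a space-separated arch list and sort it
    // so front()/back() bound the supported range.
    static void fromStr(const std::string& set_as_str, std::vector<int>& arr)
    {
        if (set_as_str.find_first_not_of(" ") == std::string::npos)
            return;

        std::istringstream stream(set_as_str);
        int cur_value;
        while (stream >> cur_value)   // extraction-as-condition also avoids the
            arr.push_back(cur_value); // trailing-token pitfall of while(!eof())

        std::sort(arr.begin(), arr.end());
    }

    int main()
    {
        std::vector<int> ptx;
        fromStr("30 20 11", ptx);  // hypothetical build: PTX for 1.1, 2.0, 3.0
        printf("min %d, max %d\n", ptx.front(), ptx.back());  // min 11, max 30
        return 0;
    }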
@@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
 bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>());
+    return cudaArch.hasPtx(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
 bool cv::gpu::TargetArchs::hasBin(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>());
+    return cudaArch.hasBin(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
 bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor,
-                          std::less_equal<int>());
+    return cudaArch.hasEqualOrLessPtx(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)

 bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
 {
-    return hasEqualOrGreaterPtx(major, minor) ||
-           hasEqualOrGreaterBin(major, minor);
+    return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
 }

 bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>());
+    return cudaArch.hasEqualOrGreaterPtx(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
 {
 #if defined (HAVE_CUDA)
-    return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor,
-                          std::greater_equal<int>());
+    return cudaArch.hasEqualOrGreaterBin(major, minor);
 #else
     (void)major;
     (void)minor;
@@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
 #endif
 }

+bool cv::gpu::deviceSupports(FeatureSet feature_set)
+{
+    static int versions[] =
+    {
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+    };
+    static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
+
+    const int devId = getDevice();
+
+    int version;
+
+    if (devId < cache_size && versions[devId] >= 0)
+        version = versions[devId];
+    else
+    {
+        DeviceInfo dev(devId);
+        version = dev.majorVersion() * 10 + dev.minorVersion();
+        if (devId < cache_size)
+            versions[devId] = version;
+    }
+
+    return TargetArchs::builtWith(feature_set) && (version >= feature_set);
+}
+
 #if !defined (HAVE_CUDA)

 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
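deviceSupports combines the compile-time question (was this module built with code for an arch that has the feature) with the runtime question (is the current device's compute capability high enough), caching the per-device version because constructing DeviceInfo queries the driver. A typical call-site sketch (the launch functions are hypothetical):

    #include <opencv2/gpu/gpu.hpp>

    void runReduceExample(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
    {
        using namespace cv::gpu;

        if (deviceSupports(WARP_SHUFFLE_FUNCTIONS))
        {
            // Safe to launch a kernel that uses warp shuffles: the current
            // device is CC >= 3.0 AND the module was compiled for SM 3.0.
            // launchShflReduce(src, dst);   // hypothetical
        }
        else
        {
            // Fall back to the shared-memory reduction path.
            // launchSmemReduce(src, dst);   // hypothetical
        }
    }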
modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp (new file, 361 lines added)
@@ -0,0 +1,361 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace reduce_detail
    {
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(val);

                For<I + 1, N>::loadToSmem(smem, val, tid);
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(val) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, val, tid);
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::merge(smem, val, tid, delta, op);
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::mergeShfl(val, delta, width, op);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
        };

        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
        {
            smem[tid] = val;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
        {
            val = smem[tid];
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
        }

        template <typename T, class Op>
        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
        {
            T reg = smem[tid + delta];
            smem[tid] = val = op(val, reg);
        }
        template <typename T, class Op>
        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
        {
            T reg = shfl_down(val, delta, width);
            val = op(val, reg);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                              unsigned int tid,
                                              unsigned int delta,
                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
        }
        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                  unsigned int delta,
                                                  unsigned int width,
                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
        }
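        // mergeShfl is the Kepler-specific path promised in the commit message:
        // shfl_down moves a register value between lanes of a warp directly, so a
        // warp can reduce with no shared-memory traffic and no __syncthreads.
        // A minimal sketch with the raw intrinsic of this era (the shfl_down
        // wrapper in ../warp_shuffle.hpp extends it beyond int):
        //
        //     // Warp-wide sum using the pre-CUDA-9 __shfl_down intrinsic (CC >= 3.0).
        //     // Each step halves the number of live partials; lane 0 ends with the total.
        //     __device__ int warpSum(int val)
        //     {
        //     #if __CUDA_ARCH__ >= 300
        //         for (int delta = 16; delta >= 1; delta /= 2)
        //             val += __shfl_down(val, delta, 32);
        //     #endif
        //         return val;   // meaningful in lane 0
        //     }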
        template <unsigned int N> struct Generic
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                loadToSmem(smem, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(smem, val, tid, 1024, op);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(smem, val, tid, 512, op);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(smem, val, tid, 256, op);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(smem, val, tid, 128, op);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(smem, val, tid, 64, op);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(smem, val, tid, 32, op);
                }

                if (tid < 16)
                {
                    merge(smem, val, tid, 16, op);
                    merge(smem, val, tid, 8, op);
                    merge(smem, val, tid, 4, op);
                    merge(smem, val, tid, 2, op);
                    merge(smem, val, tid, 1, op);
                }
            }
        };

        template <unsigned int I, typename Pointer, typename Reference, class Op>
        struct Unroll
        {
            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
            {
                mergeShfl(val, I, N, op);
                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            }
            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                merge(smem, val, tid, I, op);
                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            }
        };
        template <typename Pointer, typename Reference, class Op>
        struct Unroll<0, Pointer, Reference, Op>
        {
            static __device__ void loopShfl(Reference, Op, unsigned int)
            {
            }
            static __device__ void loop(Pointer, Reference, unsigned int, Op)
            {
            }
        };

        template <unsigned int N> struct WarpOptimized
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
            #if __CUDA_ARCH__ >= 300
                (void) smem;
                (void) tid;

                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            #else
                loadToSmem(smem, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            #endif
            }
        };

        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                const unsigned int laneId = Warp::laneId();

            #if __CUDA_ARCH__ >= 300
                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #else
                loadToSmem(smem, val, tid);

                if (laneId < 16)
                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

                __syncthreads();

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #endif

                __syncthreads();

                loadFromSmem(smem, val, tid);

                if (tid < 32)
                {
                #if __CUDA_ARCH__ >= 300
                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
                #else
                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
                #endif
                }
            }
        };

        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}

#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
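Dispatcher resolves the strategy at compile time from the thread count: a power of two up to a warp selects WarpOptimized, a power of two up to 1024 selects GenericOptimized32 (reduce within each warp, then reduce the per-warp partials), and anything else falls back to the Generic tree. A sketch of a kernel driving the detail header directly; the explicit reference template arguments mirror how a public wrapper would forward, and Add and blockSum are illustrative names, not part of the diff:

    #include <opencv2/gpu/device/detail/reduce.hpp>

    struct Add
    {
        __device__ __forceinline__ float operator()(float a, float b) const { return a + b; }
    };

    template <unsigned int BLOCK_SIZE>
    __global__ void blockSum(const float* src, float* dst, int n)
    {
        using namespace cv::gpu::device;

        __shared__ float smem[BLOCK_SIZE];   // one slot per thread

        // Grid-stride accumulation into a per-thread partial sum.
        float sum = 0.0f;
        for (int i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < n; i += gridDim.x * BLOCK_SIZE)
            sum += src[i];

        // Compile-time choice: BLOCK_SIZE <= 32 (pow2) -> WarpOptimized,
        // 64..1024 (pow2) -> GenericOptimized32, otherwise -> Generic.
        typedef typename reduce_detail::Dispatcher<BLOCK_SIZE>::reductor reductor;
        reductor::template reduce<volatile float*, float&, const Add&>(smem, sum, threadIdx.x, Add());

        if (threadIdx.x == 0)
            dst[blockIdx.x] = sum;
    }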
modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp (new file, 498 lines added)
@@ -0,0 +1,498 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace reduce_key_val_detail
    {
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(data);

                For<I + 1, N>::loadToSmem(smem, data, tid);
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(data) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, data, tid);
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
            {
                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

                For<I + 1, N>::copyShfl(val, delta, width);
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
            {
                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

                For<I + 1, N>::copy(svals, val, tid, delta);
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
            {
                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(key) = reg;
                    thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
                }

                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
                                         const CmpTuple& cmp,
                                         unsigned int tid, unsigned int delta)
            {
                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
                }

                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
            {
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
            {
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                         const ValPointerTuple&, const ValReferenceTuple&,
                                         const CmpTuple&,
                                         unsigned int, unsigned int)
            {
            }
        };

        //////////////////////////////////////////////////////
        // loadToSmem

        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
        {
            smem[tid] = data;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
        {
            data = smem[tid];
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                   const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                     const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
        }

        //////////////////////////////////////////////////////
        // copyVals

        template <typename V>
        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
        {
            val = shfl_down(val, delta, width);
        }
        template <typename V>
        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
        {
            svals[tid] = val = svals[tid + delta];
        }
        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                     unsigned int delta,
                                                     int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                 unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
        }

        //////////////////////////////////////////////////////
        // merge

        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        template <typename K,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const Cmp& cmp,
                                                  unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        template <typename K,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                                  unsigned int delta, int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
        }
        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                              unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
        }

        //////////////////////////////////////////////////////
        // Generic

        template <unsigned int N> struct Generic
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                loadToSmem(skeys, key, tid);
                loadValsToSmem(svals, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(skeys, key, svals, val, cmp, tid, 1024);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(skeys, key, svals, val, cmp, tid, 512);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(skeys, key, svals, val, cmp, tid, 256);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(skeys, key, svals, val, cmp, tid, 128);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(skeys, key, svals, val, cmp, tid, 64);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(skeys, key, svals, val, cmp, tid, 32);
                }

                if (tid < 16)
                {
                    merge(skeys, key, svals, val, cmp, tid, 16);
                    merge(skeys, key, svals, val, cmp, tid, 8);
                    merge(skeys, key, svals, val, cmp, tid, 4);
                    merge(skeys, key, svals, val, cmp, tid, 2);
                    merge(skeys, key, svals, val, cmp, tid, 1);
                }
            }
        };

        template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
        struct Unroll
        {
            static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
            {
                mergeShfl(key, val, cmp, I, N);
                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
            }
            static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                merge(skeys, key, svals, val, cmp, tid, I);
                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            }
        };
        template <class KP, class KR, class VP, class VR, class Cmp>
        struct Unroll<0, KP, KR, VP, VR, Cmp>
        {
            static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
            {
            }
            static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
            {
            }
        };

        template <unsigned int N> struct WarpOptimized
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
            #if 0 // __CUDA_ARCH__ >= 300
                (void) skeys;
                (void) svals;
                (void) tid;

                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            #endif
            }
        };

        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                const unsigned int laneId = Warp::laneId();

            #if 0 // __CUDA_ARCH__ >= 300
                Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (laneId < 16)
                    Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

                __syncthreads();

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #endif

                __syncthreads();

                loadFromSmem(skeys, key, tid);

                if (tid < 32)
                {
                #if 0 // __CUDA_ARCH__ >= 300
                    loadFromSmem(svals, val, tid);

                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
                #else
                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
                #endif
                }
            }
        };

        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}

#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
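The key/value variant reduces a key while dragging a payload along: whenever cmp prefers the incoming key, the matching value moves with it, which is exactly what minMaxLoc-style kernels need (key = pixel value, value = flat index). Note the warp-shuffle path is disabled here with #if 0, so every arch takes the shared-memory route. A block-level argmin sketch on top of the detail header (Less and blockArgMin are illustrative names; a power-of-two block size keeps it on the optimized paths):

    #include <cfloat>
    #include <opencv2/gpu/device/detail/reduce_key_val.hpp>

    struct Less
    {
        __device__ __forceinline__ bool operator()(float a, float b) const { return a < b; }
    };

    template <unsigned int BLOCK_SIZE>
    __global__ void blockArgMin(const float* src, float* minVal, int* minIdx, int n)
    {
        using namespace cv::gpu::device;

        __shared__ float skeys[BLOCK_SIZE];
        __shared__ int   svals[BLOCK_SIZE];

        // Per-thread candidate: smallest value seen and its index.
        float key = FLT_MAX;
        int   val = -1;
        for (int i = threadIdx.x; i < n; i += BLOCK_SIZE)
            if (src[i] < key) { key = src[i]; val = i; }

        // Explicit reference template arguments, mirroring how a public
        // wrapper would forward into the detail reductor.
        typedef typename reduce_key_val_detail::Dispatcher<BLOCK_SIZE>::reductor reductor;
        reductor::template reduce<volatile float*, float&, volatile int*, int&, const Less&>(
            skeys, key, svals, val, threadIdx.x, Less());

        if (threadIdx.x == 0) { *minVal = key; *minIdx = val; }
    }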
@@ -1,841 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__
#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__

namespace cv { namespace gpu { namespace device
{
    namespace utility_detail
    {
        ///////////////////////////////////////////////////////////////////////////////
        // Reductor

        template <int n> struct WarpReductor
        {
            template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
            {
                if (tid < n)
                    data[tid] = partial_reduction;
                if (n > 32) __syncthreads();

                if (n > 32)
                {
                    if (tid < n - 32)
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
                    if (tid < 16)
                    {
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                    }
                }
                else if (n > 16)
                {
                    if (tid < n - 16)
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
                    if (tid < 8)
                    {
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                    }
                }
                else if (n > 8)
                {
                    if (tid < n - 8)
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
                    if (tid < 4)
                    {
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                    }
                }
                else if (n > 4)
                {
                    if (tid < n - 4)
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
                    if (tid < 2)
                    {
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                    }
                }
                else if (n > 2)
                {
                    if (tid < n - 2)
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                    if (tid < 2)
                    {
                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                    }
                }
            }
        };
        template <> struct WarpReductor<64>
        {
            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
            {
                data[tid] = partial_reduction;
                __syncthreads();

                if (tid < 32)
                {
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                }
            }
        };
        template <> struct WarpReductor<32>
        {
            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
            {
                data[tid] = partial_reduction;

                if (tid < 16)
                {
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                }
            }
        };
        template <> struct WarpReductor<16>
        {
            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
            {
                data[tid] = partial_reduction;

                if (tid < 8)
                {
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                }
            }
        };
        template <> struct WarpReductor<8>
        {
            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
            {
                data[tid] = partial_reduction;

                if (tid < 4)
                {
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
                }
            }
        };

template <bool warp> struct ReductionDispatcher;
|
||||
template <> struct ReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
WarpReductor<n>::reduce(data, partial_reduction, tid, op);
|
||||
}
|
||||
};
|
||||
template <> struct ReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
if (tid < n)
|
||||
data[tid] = partial_reduction;
|
||||
__syncthreads();
|
||||
|
||||
|
||||
        if (n >= 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }
        if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }
        if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); }

        if (tid < 32)
        {
            data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
            data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
            data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
            data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
            data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
            data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
        }
    }
};
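// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original source). A 256-thread
// block computes a sum; plus<int>() is assumed to come from functional.hpp,
// and a block of 64 threads or fewer would go through
// ReductionDispatcher<true> (the warp-synchronous path) instead:
//
//     __shared__ int smem[256];
//     int val = threadIdx.x;   // per-thread partial value
//     utility_detail::ReductionDispatcher<false>::reduce<256>(smem, val, threadIdx.x, plus<int>());
//     // smem[0] (and val on thread 0) now holds the block-wide total
// ---------------------------------------------------------------------------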
///////////////////////////////////////////////////////////////////////////////
// PredValWarpReductor

template <int n> struct PredValWarpReductor;
template <> struct PredValWarpReductor<64>
{
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid < 32)
        {
            myData = sdata[tid];
            myVal = sval[tid];

            T reg = sdata[tid + 32];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 32];
            }

            reg = sdata[tid + 16];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 16];
            }

            reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 1];
            }
        }
    }
};
template <> struct PredValWarpReductor<32>
{
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid < 16)
        {
            myData = sdata[tid];
            myVal = sval[tid];

            T reg = sdata[tid + 16];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 16];
            }

            reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 1];
            }
        }
    }
};

template <> struct PredValWarpReductor<16>
{
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid < 8)
        {
            myData = sdata[tid];
            myVal = sval[tid];

            T reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 1];
            }
        }
    }
};
template <> struct PredValWarpReductor<8>
{
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid < 4)
        {
            myData = sdata[tid];
            myVal = sval[tid];

            T reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 1];
            }
        }
    }
};

template <bool warp> struct PredValReductionDispatcher;
template <> struct PredValReductionDispatcher<true>
{
    template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);
    }
};
template <> struct PredValReductionDispatcher<false>
{
    template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        myData = sdata[tid];
        myVal = sval[tid];

        if (n >= 512 && tid < 256)
        {
            T reg = sdata[tid + 256];

            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 256];
            }
            __syncthreads();
        }
        if (n >= 256 && tid < 128)
        {
            T reg = sdata[tid + 128];

            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 128];
            }
            __syncthreads();
        }
        if (n >= 128 && tid < 64)
        {
            T reg = sdata[tid + 64];

            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 64];
            }
            __syncthreads();
        }

        if (tid < 32)
        {
            if (n >= 64)
            {
                T reg = sdata[tid + 32];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 32];
                }
            }
            if (n >= 32)
            {
                T reg = sdata[tid + 16];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 16];
                }
            }
            if (n >= 16)
            {
                T reg = sdata[tid + 8];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 8];
                }
            }
            if (n >= 8)
            {
                T reg = sdata[tid + 4];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 4];
                }
            }
            if (n >= 4)
            {
                T reg = sdata[tid + 2];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 2];
                }
            }
            if (n >= 2)
            {
                T reg = sdata[tid + 1];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 1];
                }
            }
        }
    }
};

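// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original source): a block-wide
// "arg-min" keeping the smallest value together with its payload index.
// less<float>() is assumed to come from functional.hpp; the PredVal2 variants
// below work the same way but carry two payload arrays.
//
//     __shared__ float sdist[256];
//     __shared__ int   sidx[256];
//     sdist[tid] = myDist; sidx[tid] = myIdx;
//     __syncthreads();
//     utility_detail::PredValReductionDispatcher<false>::reduce<256>(myDist, myIdx, sdist, sidx, tid, less<float>());
//     // thread 0 now sees the minimum in myDist and its index in myIdx
// ---------------------------------------------------------------------------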
///////////////////////////////////////////////////////////////////////////////
// PredVal2WarpReductor

template <int n> struct PredVal2WarpReductor;
template <> struct PredVal2WarpReductor<64>
{
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid < 32)
        {
            myData = sdata[tid];
            myVal1 = sval1[tid];
            myVal2 = sval2[tid];

            T reg = sdata[tid + 32];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 32];
                sval2[tid] = myVal2 = sval2[tid + 32];
            }

            reg = sdata[tid + 16];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 16];
                sval2[tid] = myVal2 = sval2[tid + 16];
            }

            reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 8];
                sval2[tid] = myVal2 = sval2[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 4];
                sval2[tid] = myVal2 = sval2[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 2];
                sval2[tid] = myVal2 = sval2[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 1];
                sval2[tid] = myVal2 = sval2[tid + 1];
            }
        }
    }
};
template <> struct PredVal2WarpReductor<32>
{
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid < 16)
        {
            myData = sdata[tid];
            myVal1 = sval1[tid];
            myVal2 = sval2[tid];

            T reg = sdata[tid + 16];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 16];
                sval2[tid] = myVal2 = sval2[tid + 16];
            }

            reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 8];
                sval2[tid] = myVal2 = sval2[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 4];
                sval2[tid] = myVal2 = sval2[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 2];
                sval2[tid] = myVal2 = sval2[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 1];
                sval2[tid] = myVal2 = sval2[tid + 1];
            }
        }
    }
};

template <> struct PredVal2WarpReductor<16>
{
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid < 8)
        {
            myData = sdata[tid];
            myVal1 = sval1[tid];
            myVal2 = sval2[tid];

            T reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 8];
                sval2[tid] = myVal2 = sval2[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 4];
                sval2[tid] = myVal2 = sval2[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 2];
                sval2[tid] = myVal2 = sval2[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 1];
                sval2[tid] = myVal2 = sval2[tid + 1];
            }
        }
    }
};
template <> struct PredVal2WarpReductor<8>
{
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid < 4)
        {
            myData = sdata[tid];
            myVal1 = sval1[tid];
            myVal2 = sval2[tid];

            T reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 4];
                sval2[tid] = myVal2 = sval2[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 2];
                sval2[tid] = myVal2 = sval2[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 1];
                sval2[tid] = myVal2 = sval2[tid + 1];
            }
        }
    }
};

template <bool warp> struct PredVal2ReductionDispatcher;
template <> struct PredVal2ReductionDispatcher<true>
{
    template <int n, typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
    }
};
template <> struct PredVal2ReductionDispatcher<false>
{
    template <int n, typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        myData = sdata[tid];
        myVal1 = sval1[tid];
        myVal2 = sval2[tid];

        if (n >= 512 && tid < 256)
        {
            T reg = sdata[tid + 256];

            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 256];
                sval2[tid] = myVal2 = sval2[tid + 256];
            }
            __syncthreads();
        }
        if (n >= 256 && tid < 128)
        {
            T reg = sdata[tid + 128];

            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 128];
                sval2[tid] = myVal2 = sval2[tid + 128];
            }
            __syncthreads();
        }
        if (n >= 128 && tid < 64)
        {
            T reg = sdata[tid + 64];

            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 64];
                sval2[tid] = myVal2 = sval2[tid + 64];
            }
            __syncthreads();
        }

        if (tid < 32)
        {
            if (n >= 64)
            {
                T reg = sdata[tid + 32];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 32];
                    sval2[tid] = myVal2 = sval2[tid + 32];
                }
            }
            if (n >= 32)
            {
                T reg = sdata[tid + 16];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 16];
                    sval2[tid] = myVal2 = sval2[tid + 16];
                }
            }
            if (n >= 16)
            {
                T reg = sdata[tid + 8];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 8];
                    sval2[tid] = myVal2 = sval2[tid + 8];
                }
            }
            if (n >= 8)
            {
                T reg = sdata[tid + 4];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 4];
                    sval2[tid] = myVal2 = sval2[tid + 4];
                }
            }
            if (n >= 4)
            {
                T reg = sdata[tid + 2];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 2];
                    sval2[tid] = myVal2 = sval2[tid + 2];
                }
            }
            if (n >= 2)
            {
                T reg = sdata[tid + 1];

                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 1];
                    sval2[tid] = myVal2 = sval2[tid + 1];
                }
            }
        }
    }
};
} // namespace utility_detail
}}} // namespace cv { namespace gpu { namespace device

#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__
197 modules/gpu/include/opencv2/gpu/device/reduce.hpp Normal file
@ -0,0 +1,197 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_REDUCE_HPP__
#define __OPENCV_GPU_REDUCE_HPP__

#include <thrust/tuple.h>
#include "detail/reduce.hpp"
#include "detail/reduce_key_val.hpp"

namespace cv { namespace gpu { namespace device
{
    template <int N, typename T, class Op>
    __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
    {
        reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
    }
    template <int N,
              typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
              typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
              class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
    __device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                           const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                           unsigned int tid,
                                           const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
    {
        reduce_detail::Dispatcher<N>::reductor::template reduce<
            const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
            const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
            const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
    }

    template <unsigned int N, typename K, typename V, class Cmp>
    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
    {
        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
    }
    template <unsigned int N,
              typename K,
              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
              class Cmp>
    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
                                                 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                 unsigned int tid, const Cmp& cmp)
    {
        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&,
            const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
            const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
            const Cmp&>(skeys, key, svals, val, tid, cmp);
    }
    template <unsigned int N,
              typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
              typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
              class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
    __device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                                 const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                                 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                 unsigned int tid,
                                                 const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
    {
        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
            const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
            const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
            const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
            const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
            const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
        >(skeys, key, svals, val, tid, cmp);
    }

    // smem_tuple

    template <typename T0>
    __device__ __forceinline__
    thrust::tuple<volatile T0*>
    smem_tuple(T0* t0)
    {
        return thrust::make_tuple((volatile T0*) t0);
    }

    template <typename T0, typename T1>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*>
    smem_tuple(T0* t0, T1* t1)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
    }

    template <typename T0, typename T1, typename T2>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
    smem_tuple(T0* t0, T1* t1, T2* t2)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
    }

    template <typename T0, typename T1, typename T2, typename T3>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
    }

    template <typename T0, typename T1, typename T2, typename T3, typename T4>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
    }

    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
    }

    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
    }

    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
    }

    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
    }

    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
    __device__ __forceinline__
    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
    }
}}}

#endif // __OPENCV_GPU_REDUCE_HPP__
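A minimal usage sketch of the new interface (illustrative only; assumes a 256-thread block, plus<> from functional.hpp, and the names ssum/scnt/mySum/myCnt are hypothetical). smem_tuple marks the shared buffers volatile and thrust::tie packs the per-thread registers, so two arrays can be reduced in one pass:

    __shared__ float ssum[256];
    __shared__ int   scnt[256];

    float mySum = 0.0f;   // per-thread partial sum
    int   myCnt = 0;      // per-thread partial count

    // single-value reduction
    reduce<256>(ssum, mySum, threadIdx.x, plus<float>());

    // fused two-value reduction
    reduce<256>(smem_tuple(ssum, scnt), thrust::tie(mySum, myCnt), threadIdx.x,
                thrust::make_tuple(plus<float>(), plus<int>()));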
@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device

    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
    {
        return (uchar) ::max((int)v, 0);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
    {
        return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
    {
        return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
    {
        return (uchar) ::min(v, (uint)UCHAR_MAX);
        uint res = 0;
        int vi = v;
        asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
    {
        return saturate_cast<uchar>((uint)v);
        uint res = 0;
        asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
    {
        uint res = 0;
        asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
    {
        uint res = 0;
        asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
    {
        uint res = 0;
        asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
        return res;
    }

    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
    {
        int iv = __float2int_rn(v);
        return saturate_cast<uchar>(iv);
        uint res = 0;
        asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
        return res;
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        int iv = __double2int_rn(v);
        return saturate_cast<uchar>(iv);
    #if __CUDA_ARCH__ >= 130
        uint res = 0;
        asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
        return res;
    #else
        return saturate_cast<uchar>((float)v);
    #endif
@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device

    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
    {
        return (schar) ::min((int)v, SCHAR_MAX);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
    {
        return (schar) ::min((uint)v, (uint)SCHAR_MAX);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
    {
        return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
        uint res = 0;
        uint vi = v;
        asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
    {
        return saturate_cast<schar>((int)v);
        uint res = 0;
        asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
    {
        uint res = 0;
        asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
    {
        uint res = 0;
        asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
    {
        return (schar) ::min(v, (uint)SCHAR_MAX);
        uint res = 0;
        asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
        return res;
    }

    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
    {
        int iv = __float2int_rn(v);
        return saturate_cast<schar>(iv);
        uint res = 0;
        asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
        return res;
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        int iv = __double2int_rn(v);
        return saturate_cast<schar>(iv);
    #if __CUDA_ARCH__ >= 130
        uint res = 0;
        asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
        return res;
    #else
        return saturate_cast<schar>((float)v);
    #endif
@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device

    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
    {
        return (ushort) ::max((int)v, 0);
        ushort res = 0;
        int vi = v;
        asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
    {
        return (ushort) ::max((int)v, 0);
        ushort res = 0;
        asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
    {
        return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
        ushort res = 0;
        asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
    {
        return (ushort) ::min(v, (uint)USHRT_MAX);
        ushort res = 0;
        asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
    {
        int iv = __float2int_rn(v);
        return saturate_cast<ushort>(iv);
        ushort res = 0;
        asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
        return res;
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        int iv = __double2int_rn(v);
        return saturate_cast<ushort>(iv);
    #if __CUDA_ARCH__ >= 130
        ushort res = 0;
        asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
        return res;
    #else
        return saturate_cast<ushort>((float)v);
    #endif
@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device

    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
    {
        return (short) ::min((int)v, SHRT_MAX);
        short res = 0;
        asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(int v)
    {
        return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
        short res = 0;
        asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
    {
        return (short) ::min(v, (uint)SHRT_MAX);
        short res = 0;
        asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(float v)
    {
        int iv = __float2int_rn(v);
        return saturate_cast<short>(iv);
        short res = 0;
        asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
        return res;
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        int iv = __double2int_rn(v);
        return saturate_cast<short>(iv);
    #if __CUDA_ARCH__ >= 130
        short res = 0;
        asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
        return res;
    #else
        return saturate_cast<short>((float)v);
    #endif
    }

    template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
    {
        int res = 0;
        asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
        return res;
    }
    template<> __device__ __forceinline__ int saturate_cast<int>(float v)
    {
        return __float2int_rn(v);
@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
    #endif
    }

    template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
    {
        uint res = 0;
        int vi = v;
        asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
        return res;
    }
    template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
    {
        uint res = 0;
        asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
        return res;
    }
    template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
    {
        uint res = 0;
        asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
        return res;
    }
    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
    {
        return __float2uint_rn(v);
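For reference, a plain-C equivalent of the cvt.rni.sat.u8.f32 path above (illustrative sketch, not from the commit): __float2int_rn rounds to nearest-even, and the min/max pair performs the clamp that the .sat modifier does in hardware.

    uint res = (uint) ::min(::max(__float2int_rn(v), 0), (int)UCHAR_MAX);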
@ -45,7 +45,6 @@

#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "detail/reduction_detail.hpp"

namespace cv { namespace gpu { namespace device
{
@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
        }
    };

    ///////////////////////////////////////////////////////////////////////////////
    // Reduction

    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
        utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
    }

    template <int n, typename T, typename V, typename Pred>
    __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
        utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
    }

    template <int n, typename T, typename V1, typename V2, typename Pred>
    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
        utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Solve linear system

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__

#include "utility.hpp"
#include "reduce.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"

@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device

    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
    {
        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
        reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
    }

    __device__ __forceinline__ operator int() const
@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device

    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
    {
        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
        reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
    }

    __device__ __forceinline__ operator float() const
@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device

    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
    {
        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
        reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
    }

    __device__ __forceinline__ operator float() const
@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device

    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
    {
        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
        reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
    }

    __device__ __forceinline__ operator int() const
145 modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp Normal file
@ -0,0 +1,145 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__

namespace cv { namespace gpu { namespace device
{
    template <typename T>
    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        return __shfl(val, srcLane, width);
    #else
        return T();
    #endif
    }
    __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        return (unsigned int) __shfl((int) val, srcLane, width);
    #else
        return 0;
    #endif
    }
    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        int lo = __double2loint(val);
        int hi = __double2hiint(val);

        lo = __shfl(lo, srcLane, width);
        hi = __shfl(hi, srcLane, width);

        return __hiloint2double(hi, lo);
    #else
        return 0.0;
    #endif
    }

    template <typename T>
    __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        return __shfl_down(val, delta, width);
    #else
        return T();
    #endif
    }
    __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        return (unsigned int) __shfl_down((int) val, delta, width);
    #else
        return 0;
    #endif
    }
    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        int lo = __double2loint(val);
        int hi = __double2hiint(val);

        lo = __shfl_down(lo, delta, width);
        hi = __shfl_down(hi, delta, width);

        return __hiloint2double(hi, lo);
    #else
        return 0.0;
    #endif
    }

    template <typename T>
    __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        return __shfl_up(val, delta, width);
    #else
        return T();
    #endif
    }
    __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        return (unsigned int) __shfl_up((int) val, delta, width);
    #else
        return 0;
    #endif
    }
    __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        int lo = __double2loint(val);
        int hi = __double2hiint(val);

        lo = __shfl_up(lo, delta, width);
        hi = __shfl_up(hi, delta, width);

        return __hiloint2double(hi, lo);
    #else
        return 0.0;
    #endif
    }
}}}

#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
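A minimal usage sketch (illustrative; assumes a full 32-lane warp on a compute 3.0 device, where these wrappers compile to real shuffle instructions rather than the pre-Kepler fallbacks):

    #include "opencv2/gpu/device/warp_shuffle.hpp"

    __device__ float warpSum(float val)
    {
        // tree reduction across the warp, no shared memory needed
        for (int delta = 16; delta > 0; delta /= 2)
            val = val + cv::gpu::device::shfl_down(val, delta);
        return val;   // lane 0 ends up holding the warp-wide total
    }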
@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
    (void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream;
    (void)src1;
    (void)src2;
    (void)alpha;
    (void)src3;
    (void)beta;
    (void)dst;
    (void)flags;
    (void)stream;
    CV_Error(CV_StsNotImplemented, "The library was built without CUBLAS");

#else

    // CUBLAS works with column-major matrices

    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G

    if (src1.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
    }

    cublasSafeCall( cublasDestroy_v2(handle) );

#endif
}

@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
    }
    else // if (src.elemSize() == 8)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");

        NppStStreamHandler h(stream);
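Throughout these hunks the two-step capability test (TargetArchs::builtWith plus DeviceInfo().supports) is folded into the new deviceSupports() helper. A minimal sketch of the calling pattern (illustrative):

    // guard a double-precision path on devices without native double support
    if (!cv::gpu::deviceSupports(cv::gpu::NATIVE_DOUBLE))
        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");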
@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_knnmatch
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_radius_match
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
@ -198,11 +198,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
using namespace cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
@ -234,10 +234,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
caller_t func = callers[distType][query.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
@ -340,11 +337,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
if (query.empty() || trainCollection.empty())
return;

using namespace ::cv::gpu::device::bf_match;
using namespace cv::gpu::device::bf_match;

typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);

static const caller_t callers[3][6] =
{
@ -376,10 +373,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);

DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();

func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
}

void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
@ -451,11 +445,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
if (query.empty() || train.empty())
return;

using namespace ::cv::gpu::device::bf_knnmatch;
using namespace cv::gpu::device::bf_knnmatch;

typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream);
cudaStream_t stream);

static const caller_t callers[3][6] =
{
@ -502,10 +496,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);

DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();

func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
}

void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@ -580,11 +571,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
if (query.empty() || trainCollection.empty())
return;

using namespace ::cv::gpu::device::bf_knnmatch;
using namespace cv::gpu::device::bf_knnmatch;

typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream);
cudaStream_t stream);

static const caller_t callers[3][6] =
{
@ -621,10 +612,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);

DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();

func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
}

void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
@ -765,7 +753,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,

typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);

static const caller_t callers[3][6] =
{
@ -786,12 +774,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
}
};

DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();

if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

const int nQuery = query.rows;
const int nTrain = train.rows;

@ -814,7 +796,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
caller_t func = callers[distType][query.depth()];
CV_Assert(func != 0);

func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
}

void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
@ -897,7 +879,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu

typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream);
cudaStream_t stream);

static const caller_t callers[3][6] =
{
@ -918,12 +900,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
}
};

DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();

if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

const int nQuery = query.rows;

CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
@ -949,7 +925,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
vector<PtrStepSzb> masks_(masks.begin(), masks.end());

func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
}

void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,

@ -42,10 +42,13 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device
int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx)
{
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;

float d1, d2;
int i1, i2;

#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);

if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;

bestDistance1 = d1;
bestTrainIdx1 = i1;

if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device

bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2;
#endif
}

template <int BLOCK_SIZE>
@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device
int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
(void) s_imgIdx;

float d1, d2;
int i1, i2;
int j1, j2;

#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);

if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
bestImgIdx2 = j1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestImgIdx2 = bestImgIdx1;

bestDistance1 = d1;
bestTrainIdx1 = i1;
bestImgIdx1 = j1;

if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
bestImgIdx2 = j2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device

bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2;
#endif
}

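Both findBestMatch specializations above gain a Kepler warp-shuffle path when compiled for sm_30 or newer: candidate (distance, index) pairs are exchanged directly between lanes instead of being staged in shared memory. A self-contained sketch of the underlying primitive, assuming only the legacy __shfl_down intrinsic of that CUDA generation (the function name is illustrative, not part of the patch):

    // Warp-wide minimum via shuffle: after log2(32) halving steps, lane 0
    // holds the smallest value. No shared memory and no __syncthreads() is
    // needed, which is what makes the sm_30 path above cheaper.
    __device__ float warpReduceMin(float val)
    {
        for (int offset = 16; offset >= 1; offset /= 2)
            val = ::fminf(val, __shfl_down(val, offset));
        return val;
    }

The kernel code keeps two best candidates per lane, so each step merges the incoming (d1, d2) pair into the local (best1, best2) ordering rather than taking a plain minimum.
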
///////////////////////////////////////////////////////////////////////////////
@ -748,9 +839,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@ -780,9 +870,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@ -945,9 +1034,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
@ -1005,7 +1093,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads();

reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());

if (threadIdx.x == 0)
{
@ -1034,7 +1122,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}

void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
{
findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
}
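The hunk above swaps the old reducePredVal helper for reduceKeyVal from the new reduce.hpp: a block-wide reduction that moves a key (the distance) together with its value (the train index). An illustrative shared-memory equivalent of that operation, with hypothetical names, assuming BLOCK_SIZE is a power of two:

    // Block-wide argmin: the smaller key wins and its value travels with it.
    template <int BLOCK_SIZE>
    __device__ void blockArgMin(volatile float* s_dist, float& dist,
                                volatile int* s_idx, int& bestIdx, int tid)
    {
        s_dist[tid] = dist;
        s_idx[tid] = bestIdx;
        __syncthreads();

        for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2)
        {
            if (tid < offset && s_dist[tid + offset] < s_dist[tid])
            {
                s_dist[tid] = dist = s_dist[tid + offset];    // smaller key wins
                s_idx[tid] = bestIdx = s_idx[tid + offset];   // value follows key
            }
            __syncthreads();
        }
    }
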
@ -1045,16 +1133,16 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (k == 2)
{
match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
}
else
{
calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
}
}

@ -1063,103 +1151,103 @@ namespace cv { namespace gpu { namespace device

template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
}

template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}

template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}

//template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}

template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
} // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device {

@ -42,7 +42,9 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;

s_distance[threadIdx.x] = bestDistance;
s_trainIdx[threadIdx.x] = bestTrainIdx;

__syncthreads();

reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
}

template <int BLOCK_SIZE>
@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE;

s_distance[threadIdx.x] = bestDistance;
s_trainIdx[threadIdx.x] = bestTrainIdx;
s_imgIdx [threadIdx.x] = bestImgIdx;

__syncthreads();

reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
}

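The second reduction above carries two payloads per key (train index and image index), which reduceKeyVal expresses through the smem_tuple/thrust::tie bundling. Unpacked into explicit arrays, the same operation looks roughly like this hypothetical helper (power-of-two BLOCK_SIZE assumed):

    // Two-payload block argmin: the distance is the key, and the
    // (trainIdx, imgIdx) pair travels with whichever key wins.
    template <int BLOCK_SIZE>
    __device__ void blockArgMin2(volatile float* s_dist, float& dist,
                                 volatile int* s_train, int& trainIdx,
                                 volatile int* s_img, int& imgIdx, int tid)
    {
        s_dist[tid] = dist; s_train[tid] = trainIdx; s_img[tid] = imgIdx;
        __syncthreads();

        for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2)
        {
            if (tid < offset && s_dist[tid + offset] < s_dist[tid])
            {
                dist     = s_dist[tid]  = s_dist[tid + offset];
                trainIdx = s_train[tid] = s_train[tid + offset];
                imgIdx   = s_img[tid]   = s_img[tid + offset];
            }
            __syncthreads();
        }
    }
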
///////////////////////////////////////////////////////////////////////////////
|
||||
@ -567,9 +558,8 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
(void)cc;
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
|
||||
@ -599,9 +589,8 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
(void)cc;
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
|
||||
@ -633,151 +622,151 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
|
||||
trainIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (masks.data)
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
|
||||
trainIdx, imgIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
|
||||
trainIdx, imgIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
|
||||
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
|
||||
int cc, cudaStream_t stream)
|
||||
cudaStream_t stream)
|
||||
{
|
||||
if (masks.data)
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
|
||||
trainIdx, imgIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
|
||||
trainIdx, imgIdx, distance,
|
||||
cc, stream);
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);

template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
cc, stream);
stream);
}
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
} // namespace bf_match
}}} // namespace cv { namespace gpu { namespace device {

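The masked and unmasked branches above differ only in the functor handed to matchDispatcher, so the mask test is resolved at compile time rather than per candidate pair at run time. A minimal sketch of that pattern; the functor and kernel names below are illustrative, not the actual OpenCV types:

// Hypothetical mask functors: one always accepts, one reads a byte mask.
struct AlwaysValid
{
    __device__ __forceinline__ bool operator ()(int, int) const { return true; }
};

struct BufferMask
{
    const unsigned char* data;
    size_t step;
    __device__ __forceinline__ bool operator ()(int queryIdx, int trainIdx) const
    {
        return data[queryIdx * step + trainIdx] != 0;
    }
};

template <typename Mask>
__global__ void matchKernelSketch(Mask mask /*, descriptors, results */)
{
    // mask(queryIdx, trainIdx) compiles to a constant 'true' for AlwaysValid,
    // so the masked and unmasked paths share one kernel body and the
    // unmasked instantiation pays no per-pair branch.
}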
@ -42,7 +42,8 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

extern __shared__ int smem[];

const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}

#endif
}

template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

extern __shared__ int smem[];

const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}

#endif
}

template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
@ -281,9 +274,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
@ -313,9 +305,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
@ -347,124 +338,124 @@ namespace cv { namespace gpu { namespace device

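matchDispatcher branches on query.cols because the unrolled kernels take the maximum descriptor length as a template parameter, which gives the inner accumulation loop a compile-time trip count that the compiler can fully unroll. A minimal sketch of why that matters, assuming a plain L1 accumulation (the helper below is hypothetical):

template <int MAX_DESC_LEN, typename T>
__device__ float accumulateL1Unrolled(const T* query, const T* train)
{
    float sum = 0.0f;
    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN; ++i)  // trip count known at compile time
        sum += ::fabsf((float)query[i] - (float)train[i]);
    return sum;
}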
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);

template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);

template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
stream);
}
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);

template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);

template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);

template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
stream);
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device

@ -42,9 +42,10 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"

namespace cv { namespace gpu { namespace device
{
@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
};

void call(const PtrStepSz<float3> src, const float* rot,
@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
};

void call(const PtrStepSz<float3> src, const float* rot,
@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
return x * x;
}

template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers)
@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
++num_inliers;
}

extern __shared__ float s_num_inliers[];
s_num_inliers[threadIdx.x] = num_inliers;
__syncthreads();

for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
__shared__ int s_num_inliers[BLOCK_SIZE];
reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());

if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0];
g_num_inliers[blockIdx.x] = num_inliers;
}

void computeHypothesisScores(
@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device

dim3 threads(256);
dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);

computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() );

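The hunk above drops the hand-written shared-memory tree reduction in favour of the new reduce<BLOCK_SIZE>(...) utility, which also updates num_inliers in place so thread 0 can store it directly. A condensed sketch of the equivalent tree reduction for a power-of-two block; the real utility additionally supports tuples of values and operators:

// Minimal block-wide sum, assuming BLOCK_SIZE is a power of two and
// equal to blockDim.x.
template <int BLOCK_SIZE>
__device__ int blockSum(int val, volatile int* smem, int tid)
{
    smem[tid] = val;
    __syncthreads();

    #pragma unroll
    for (int step = BLOCK_SIZE / 2; step > 0; step /= 2)
    {
        if (tid < step)
            smem[tid] = val = val + smem[tid + step];
        __syncthreads();
    }
    return val;  // only tid == 0 holds the full block sum on return
}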
@ -43,172 +43,148 @@
#if !defined CUDA_DISABLER

#include <utility>
#include <algorithm>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace device
{
namespace canny
{
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
struct L1 : binary_function<int, int, float>
{
__shared__ int smem[16][18];

const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;

if (i < rows)
{
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
if (threadIdx.x == 0)
{
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
}
__syncthreads();

if (j < cols)
{
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
}
}
}

void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
}

struct L1
{
static __device__ __forceinline__ float calc(int x, int y)
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::abs(x) + ::abs(y);
}

__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
};
struct L2
struct L2 : binary_function<int, int, float>
{
static __device__ __forceinline__ float calc(int x, int y)
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::sqrtf(x * x + y * y);
}

__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
};
}

namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
enum { smart_shift = 4 };
};
template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
enum { smart_shift = 4 };
};
}}}

|
||||
{
|
||||
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
struct SrcTex
|
||||
{
|
||||
const int xoff;
|
||||
const int yoff;
|
||||
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
|
||||
|
||||
__device__ __forceinline__ int operator ()(int y, int x) const
|
||||
{
|
||||
return tex2D(tex_src, x + xoff, y + yoff);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
|
||||
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
template <class Norm> __global__
|
||||
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
|
||||
{
|
||||
__shared__ int sdx[18][16];
|
||||
__shared__ int sdy[18][16];
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
if (y >= mag.rows || x >= mag.cols)
|
||||
return;
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
|
||||
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
|
||||
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
|
||||
|
||||
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
}
|
||||
__syncthreads();
|
||||
dx(y, x) = dxVal;
|
||||
dy(y, x) = dyVal;
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
|
||||
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
|
||||
|
||||
dx.ptr(i)[j] = x;
|
||||
dy.ptr(i)[j] = y;
|
||||
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
|
||||
}
|
||||
}
|
||||
mag(y, x) = norm(dxVal, dyVal);
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
|
||||
|
||||
bindTexture(&tex_src, srcWhole);
|
||||
SrcTex src(xoff, yoff);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
{
|
||||
L2 norm;
|
||||
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
|
||||
}
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
{
|
||||
L1 norm;
|
||||
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
|
||||
}
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
|
||||
{
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i < rows && j < cols)
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
{
|
||||
L2 norm;
|
||||
transform(dx, dy, mag, norm, WithOutMask(), 0);
|
||||
}
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
{
|
||||
L1 norm;
|
||||
transform(dx, dy, mag, norm, WithOutMask(), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
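The rewritten calcMagnitudeKernel folds the old two-pass row/column Sobel (calcSobelRowPass plus the buffered column pass) into one kernel that reads the clamped source texture directly; dxVal and dyVal above are the standard 3x3 Sobel responses. For reference, the kernels it evaluates per pixel, plus a plain CPU check of the same arithmetic (assuming in-bounds, clamped access):

//        | -1  0  1 |            | -1 -2 -1 |
//   Gx = | -2  0  2 |       Gy = |  0  0  0 |
//        | -1  0  1 |            |  1  2  1 |
static inline int sobelGx(const unsigned char* p, int step)
{
    // p points at the centre pixel; step is the row stride in bytes.
    return (p[-step + 1] + 2 * p[1] + p[step + 1])
         - (p[-step - 1] + 2 * p[-1] + p[step - 1]);
}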
//////////////////////////////////////////////////////////////////////////////////////////

#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)

__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
namespace canny
{
__shared__ float smem[18][18];
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);

const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;

const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;

if (ly < 14)
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];

__syncthreads();

if (i < rows && j < cols)
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
int x = dx.ptr(i)[j];
int y = dy.ptr(i)[j];
const int s = (x ^ y) < 0 ? -1 : 1;
const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

x = ::abs(x);
y = ::abs(y);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;

int dxVal = dx(y, x);
int dyVal = dy(y, x);

const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D(tex_mag, x, y);

dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);

// 0 - the pixel cannot belong to an edge
// 1 - the pixel might belong to an edge
@ -217,73 +193,81 @@ namespace cv { namespace gpu { namespace device

if (m > low_thresh)
{
const int tg22x = x * TG22;
const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

y <<= CANNY_SHIFT;
dyVal <<= CANNY_SHIFT;

if (y < tg22x)
if (dyVal < tg22x)
{
if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if( y > tg67x )
else if (dyVal > tg67x)
{
if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}

map.ptr(i + 1)[j + 1] = edge_type;
}
map(y, x) = edge_type;
}

#undef CANNY_SHIFT
#undef TG22

void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
bindTexture(&tex_mag, mag);

calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
}
}

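calcMapKernel classifies the gradient direction without floating point: TG22 is tan(22.5 deg) in Q15 fixed point, i.e. round(0.41421356 * 2^15) = 13573, and since tan(67.5 deg) = tan(22.5 deg) + 2, the second threshold is tg67x = tg22x + ((dx + dx) << CANNY_SHIFT). A worked example:

// For a gradient (dx, dy) = (100, 50):
//   tg22x    = 100 * 13573            = 1357300
//   tg67x    = 1357300 + 200 * 32768  = 7910900
//   dy << 15 = 50 * 32768             = 1638400
// tg22x < 1638400 < tg67x, so the gradient falls in the diagonal
// sector and the pixel is compared against its diagonal neighbours
// (the final else branch above).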
//////////////////////////////////////////////////////////////////////////////////////////

__device__ unsigned int counter = 0;

__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
namespace canny
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)
__device__ int counter = 0;

__shared__ int smem[18][18];
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
{
__shared__ volatile int smem[18][18];

const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;

if (ly < 14)
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;

__syncthreads();

if (i < rows && j < cols)
{
if (x >= map.cols || y >= map.rows)
return;

int n;

#pragma unroll
@ -311,7 +295,7 @@ namespace cv { namespace gpu { namespace device

const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

map.ptr(i + 1)[j + 1] = e;
map(y, x) = e;

n = 0;

@ -331,69 +315,70 @@ namespace cv { namespace gpu { namespace device

if (n > 0)
{
const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
st[ind] = make_ushort2(j + 1, i + 1);
const int ind = ::atomicAdd(&counter, 1);
st[ind] = make_ushort2(x, y);
}
}

#endif
}

void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
const dim3 block(16, 16);
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
}
}

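A brief note on the tile layout used by edgesHysteresisLocalKernel above, summarizing what the halo loads establish:

// Each 16x16 block stages an 18x18 window of 'map' in shared memory, so
// smem[ty + 1][tx + 1] is the thread's own pixel and the eight entries
// around it are its neighbours. Edge threads additionally fill the
// one-pixel halo (rows 0 and 17, columns 0 and 17), with out-of-image
// positions zeroed. This replaces the old version's index arithmetic
// over a flat tid and keeps all neighbour reads in shared memory.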
//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};

__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120

const int stack_size = 512;

__shared__ unsigned int s_counter;
__shared__ unsigned int s_ind;
__shared__ int s_counter;
__shared__ int s_ind;
__shared__ ushort2 s_st[stack_size];

if (threadIdx.x == 0)
s_counter = 0;

__syncthreads();

int ind = blockIdx.y * gridDim.x + blockIdx.x;

if (ind < count)
{
if (ind >= count)
return;

ushort2 pos = st1[ind];

if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];

if (map.ptr(pos.y)[pos.x] == 1)
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map.ptr(pos.y)[pos.x] = 2;
map(pos.y, pos.x) = 2;

ind = atomicInc(&s_counter, (unsigned int)(-1));
ind = Emulation::smem::atomicAdd(&s_counter, 1);

s_st[ind] = pos;
}
}

__syncthreads();

while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
@ -401,30 +386,31 @@ namespace cv { namespace gpu { namespace device
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);

pos.x = pos.y = 0;

if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];

__syncthreads();

if (threadIdx.x == 0)
s_counter -= portion;

__syncthreads();

if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
if (subTaskIdx < portion)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];

if (map.ptr(pos.y)[pos.x] == 1)
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map.ptr(pos.y)[pos.x] = 2;
map(pos.y, pos.x) = 2;

ind = atomicInc(&s_counter, (unsigned int)(-1));
ind = Emulation::smem::atomicAdd(&s_counter, 1);

s_st[ind] = pos;
}
}

__syncthreads();
}

@ -432,70 +418,76 @@ namespace cv { namespace gpu { namespace device
{
if (threadIdx.x == 0)
{
ind = atomicAdd(&counter, s_counter);
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}

__syncthreads();

ind = s_ind;

for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
{
st2[ind + i] = s_st[i];
}
}
}
}

#endif
}

void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

while (count > 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

dim3 block(128, 1, 1);
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );

cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

std::swap(st1, st2);
}
}

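edgesHysteresisGlobal drives a breadth-first expansion over the candidate map: st1 holds the current frontier, the kernel promotes neighbouring '1' pixels to '2' and appends them to st2 through the global counter, and the host swaps the two stacks until no new pixels appear. A condensed sketch of that host loop (the helper names here are illustrative):

// int count = readCounter();               // seeds found by the local pass
// while (count > 0)
// {
//     zeroCounter();
//     expandFrontierKernel(map, st1, st2, count);  // writes next frontier to st2
//     count = readCounter();               // size of the new frontier
//     std::swap(st1, st2);                 // next iteration consumes it
// }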
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
{
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;

if (i < rows && j < cols)
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
}

void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

getEdges<<<grid, block>>>(map, dst, rows, cols);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
struct GetEdges : unary_function<int, uchar>
{
__device__ __forceinline__ uchar operator ()(int e) const
{
return (uchar)(-(e >> 1));
}
} // namespace canny
}}} // namespace cv { namespace gpu { namespace device

__device__ __forceinline__ GetEdges() {}
__device__ __forceinline__ GetEdges(const GetEdges&) {}
};
}

namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
{
enum { smart_shift = 4 };
};
}}}

namespace canny
{
void getEdges(PtrStepSzi map, PtrStepSzb dst)
{
transform(map, dst, GetEdges(), WithOutMask(), 0);
}
}

#endif /* CUDA_DISABLER */
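For reference, the GetEdges functor collapses the tri-state edge map to the final 8-bit image with a single shift-and-negate:

//   e = 0 (non-edge):   -(0 >> 1) = 0   -> (uchar)0  = 0
//   e = 1 (candidate):  -(1 >> 1) = 0   -> (uchar)0  = 0
//   e = 2 (edge):       -(2 >> 1) = -1  -> (uchar)-1 = 255
// Only pixels confirmed by hysteresis survive, already scaled to 255.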
File diff suppressed because it is too large
@ -46,6 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "fgd_bgfg_common.hpp"

using namespace cv::gpu;
@ -181,57 +183,8 @@ namespace bgfg
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];

data0[threadIdx.x] = sum0;
data1[threadIdx.x] = sum1;
data2[threadIdx.x] = sum2;
__syncthreads();

if (threadIdx.x < 128)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
}
__syncthreads();

if (threadIdx.x < 64)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
}
__syncthreads();

if (threadIdx.x < 32)
{
volatile unsigned int* vdata0 = data0;
volatile unsigned int* vdata1 = data1;
volatile unsigned int* vdata2 = data2;

vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];

vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];

vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];

vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];

vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];

vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
}
plus<unsigned int> op;
reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));

if(threadIdx.x == 0)
{
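The replacement hunk above collapses three interleaved tree reductions into one call that reduces sum0, sum1 and sum2 in lockstep; after it returns, thread 0's values hold the block totals, which is why only threadIdx.x == 0 writes them out:

// plus<unsigned int> op;
// reduce<MERGE_THREADBLOCK_SIZE>(
//     smem_tuple(data0, data1, data2),   // three shared-memory scratch arrays
//     thrust::tie(sum0, sum1, sum2),     // three running values, updated in place
//     threadIdx.x,
//     thrust::make_tuple(op, op, op));   // one operator per reduced value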
@@ -245,9 +198,9 @@ namespace bgfg
     void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
                                unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                                unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
-                               int cc, cudaStream_t stream)
+                               bool cc20, cudaStream_t stream)
     {
-        const int HISTOGRAM_WARP_COUNT = cc < 20 ? 4 : 6;
+        const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
         const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;

         calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
@@ -261,10 +214,10 @@ namespace bgfg
         cudaSafeCall( cudaDeviceSynchronize() );
     }

-    template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);

     /////////////////////////////////////////////////////////////////////////
     // calcDiffThreshMask
@@ -125,7 +125,7 @@ namespace bgfg
     void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
                                unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                                unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
-                               int cc, cudaStream_t stream);
+                               bool cc20, cudaStream_t stream);

     template <typename PT, typename CT>
     void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
@@ -43,12 +43,10 @@

 #if !defined CUDA_DISABLER

-#include "thrust/device_ptr.h"
-#include "thrust/remove.h"
-#include "thrust/functional.h"
-#include "internal_shared.hpp"
-
-using namespace thrust;
+#include <thrust/device_ptr.h>
+#include <thrust/remove.h>
+#include <thrust/functional.h>
+#include "opencv2/gpu/device/common.hpp"

 namespace cv { namespace gpu { namespace device { namespace globmotion {
@@ -61,10 +59,10 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
     thrust::device_ptr<float2> dpoints1((float2*)points1);
     thrust::device_ptr<const uchar> dmask(mask);

-    return thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
+    return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
                              thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
                              dmask, thrust::not1(thrust::identity<uchar>()))
-           - make_zip_iterator(make_tuple(dpoints0, dpoints1));
+           - thrust::make_zip_iterator(make_tuple(dpoints0, dpoints1)));
 }
@@ -43,182 +43,112 @@

 #if !defined CUDA_DISABLER

-#include "internal_shared.hpp"
-#include "opencv2/gpu/device/utility.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/emulation.hpp"
+#include "opencv2/gpu/device/transform.hpp"

-namespace cv { namespace gpu { namespace device
-{
-    #define UINT_BITS 32U
-
-    //Warps == subhistograms per threadblock
-    #define WARP_COUNT 6
-
-    //Threadblock size
-    #define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
-    #define HISTOGRAM256_BIN_COUNT 256
-
-    //Shared memory per threadblock
-    #define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
-
-    #define PARTIAL_HISTOGRAM256_COUNT 240
-
-    #define MERGE_THREADBLOCK_SIZE 256
-
-    #define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
+using namespace cv::gpu;
+using namespace cv::gpu::device;

 namespace hist
 {
-    #if (!USE_SMEM_ATOMICS)
-
-        #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
-
-        __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
-        {
-            uint count;
-            do
-            {
-                count = s_WarpHist[data] & TAG_MASK;
-                count = threadTag | (count + 1);
-                s_WarpHist[data] = count;
-            } while (s_WarpHist[data] != count);
-        }
-
-    #else
-
-        #define TAG_MASK 0xFFFFFFFFU
-
-        __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
-        {
-            atomicAdd(s_WarpHist + data, 1);
-        }
-
-    #endif
-
-    __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
-    {
-        uint x = pos_x << 2;
-
-        if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
-        if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
-        if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
-        if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
-    }
-
-    __global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
     {
-        //Per-warp subhistogram storage
-        __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
-        uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
-
-        //Clear shared memory storage for current threadblock before processing
-        #pragma unroll
-        for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
-            s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
-
-        //Cycle through the entire data set, update subhistograms for each warp
-        const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
+        __shared__ int shist[256];
+
+        const int y = blockIdx.x * blockDim.y + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        shist[tid] = 0;
+        __syncthreads();
+
+        if (y < rows)
+        {
+            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
+
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
+            {
+                unsigned int data = rowPtr[x];
+
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
+            }
+
+            if (cols % 4 != 0 && threadIdx.x == 0)
+            {
+                for (int x = cols_4 * 4; x < cols; ++x)
+                {
+                    unsigned int data = ((const uchar*)rowPtr)[x];
+                    Emulation::smem::atomicAdd(&shist[data], 1);
+                }
+            }
+        }

         __syncthreads();
-        const uint colsui = d_Data.step / sizeof(uint);
-        for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
-        {
-            uint pos_y = pos / colsui;
-            uint pos_x = pos % colsui;
-            uint data = d_Data.ptr(pos_y)[pos_x];
-            addWord(s_WarpHist, data, tag, pos_x, cols);
-        }

-        //Merge per-warp histograms into per-block and write to global memory
-        __syncthreads();
-        for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
-        {
-            uint sum = 0;
-
-            for (uint i = 0; i < WARP_COUNT; i++)
-                sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
-
-            d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
-        }
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    // Merge histogram256() output
-    // Run one threadblock per bin; each threadblock adds up the same bin counter
-    // from every partial histogram. Reads are uncoalesced, but mergeHistogram256
-    // takes only a fraction of total processing time
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
-    {
-        uint sum = 0;
-
-        #pragma unroll
-        for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
-            sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
-
-        __shared__ uint data[MERGE_THREADBLOCK_SIZE];
-        data[threadIdx.x] = sum;
-
-        for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
-        {
-            __syncthreads();
-            if(threadIdx.x < stride)
-                data[threadIdx.x] += data[threadIdx.x + stride];
-        }
-
-        if(threadIdx.x == 0)
-            d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
-    }
-
-    void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
-    {
-        histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
-            PtrStepSz<uint>(src),
-            buf,
-            static_cast<uint>(src.rows * src.step / sizeof(uint)),
-            src.cols);
-
-        cudaSafeCall( cudaGetLastError() );
-
-        mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
+        const int histVal = shist[tid];
+        if (histVal > 0)
+            ::atomicAdd(hist + tid, histVal);
+    }
+
+    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(src.rows, block.y));
+
+        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
         cudaSafeCall( cudaGetLastError() );

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 }

 /////////////////////////////////////////////////////////////////////////

 namespace hist
 {
     __constant__ int c_lut[256];

-    __global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
+    struct EqualizeHist : unary_function<uchar, uchar>
     {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        float scale;

-        if (x < src.cols && y < src.rows)
+        __host__ EqualizeHist(float _scale) : scale(_scale) {}
+
+        __device__ __forceinline__ uchar operator ()(uchar val) const
         {
-            const uchar val = src.ptr(y)[x];
             const int lut = c_lut[val];
-            dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
+            return __float2int_rn(scale * lut);
         }
-    }
+    };
+}

-    void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
     {
-        dim3 block(16, 16);
-        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-
-        cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
-
-        equalizeHist<<<grid, block, 0, stream>>>(src, dst);
-        cudaSafeCall( cudaGetLastError() );
+        enum { smart_shift = 4 };
+    };
+}}}

+namespace hist
+{
+    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
+    {
         if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-} // namespace hist
-}}} // namespace cv { namespace gpu { namespace device
+            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
+        else
+            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
+
+        const float scale = 255.0f / (src.cols * src.rows);
+
+        transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
+    }
+}

 #endif /* CUDA_DISABLER */
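The rewrite replaces the two-kernel partial-histogram scheme with a single kernel: each block builds a 256-bin sub-histogram in shared memory via `Emulation::smem::atomicAdd` (which falls back to a tag-based emulation where native shared-memory atomics are unavailable) and flushes only non-empty bins to the global histogram. A standalone sketch of the same strategy in plain CUDA (not the commit's code; assumes sm_12+ for native shared-memory atomics):

// One 256-bin sub-histogram per block, filled with shared-memory atomics,
// flushed with one global atomicAdd per non-empty bin.
// Launch assumption: blockDim.x == 256.
__global__ void hist256(const unsigned char* data, int n, int* hist)
{
    __shared__ int shist[256];

    const int tid = threadIdx.x;
    shist[tid] = 0;
    __syncthreads();

    // grid-stride loop over the input bytes
    for (int i = blockIdx.x * blockDim.x + tid; i < n; i += blockDim.x * gridDim.x)
        atomicAdd(&shist[data[i]], 1);

    __syncthreads();

    const int val = shist[tid];
    if (val > 0)                      // skip empty bins to cut global traffic
        atomicAdd(hist + tid, val);
}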
@@ -42,7 +42,10 @@

 #if !defined CUDA_DISABLER

-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"

 namespace cv { namespace gpu { namespace device
 {
@@ -226,29 +229,32 @@ namespace cv { namespace gpu { namespace device

     template<int size>
-    __device__ float reduce_smem(volatile float* smem)
+    __device__ float reduce_smem(float* smem, float val)
     {
         unsigned int tid = threadIdx.x;
-        float sum = smem[tid];
+        float sum = val;

-        if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
-        if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
-        if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
+        reduce<size>(smem, sum, tid, plus<float>());

-        if (tid < 32)
+        if (size == 32)
         {
-            if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-            if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-            if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-            if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-            if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-            if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
+        #if __CUDA_ARCH__ >= 300
+            return shfl(sum, 0);
+        #else
+            return smem[0];
+        #endif
         }
+        else
+        {
+        #if __CUDA_ARCH__ >= 300
+            if (threadIdx.x == 0)
+                smem[0] = sum;
+        #endif

             __syncthreads();
-        sum = smem[0];

-        return sum;
+            return smem[0];
+        }
     }

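Note the `size == 32` branch: on Kepler (`__CUDA_ARCH__ >= 300`) the new `reduce` is shuffle-based, so a single-warp total can be broadcast straight from lane 0's register with `shfl`, never touching shared memory. A sketch of that idiom with the raw intrinsics (assumes sm_30+ and the pre-CUDA-9 `__shfl`/`__shfl_down` forms that match this commit's era):

// Warp-wide sum with register-only broadcast; caller runs one 32-thread warp.
__device__ float warpSumBroadcast(float val)
{
    #pragma unroll
    for (int delta = 16; delta > 0; delta /= 2)
        val += __shfl_down(val, delta);   // tree reduction in registers
    return __shfl(val, 0);                // broadcast lane 0's total to all lanes
}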
@@ -272,19 +278,13 @@ namespace cv { namespace gpu { namespace device
         if (threadIdx.x < block_hist_size)
             elem = hist[0];

-        squares[threadIdx.x] = elem * elem;
-
-        __syncthreads();
-        float sum = reduce_smem<nthreads>(squares);
+        float sum = reduce_smem<nthreads>(squares, elem * elem);

         float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
         elem = ::min(elem * scale, threshold);

-        __syncthreads();
-        squares[threadIdx.x] = elem * elem;
+        sum = reduce_smem<nthreads>(squares, elem * elem);

-        __syncthreads();
-        sum = reduce_smem<nthreads>(squares);
         scale = 1.0f / (::sqrtf(sum) + 1e-3f);

         if (threadIdx.x < block_hist_size)
@@ -355,40 +355,11 @@ namespace cv { namespace gpu { namespace device
         __shared__ float products[nthreads * nblocks];

         const int tid = threadIdx.z * nthreads + threadIdx.x;
-        products[tid] = product;
-
-        __syncthreads();
-
-        if (nthreads >= 512)
-        {
-            if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
-            __syncthreads();
-        }
-        if (nthreads >= 256)
-        {
-            if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
-            __syncthreads();
-        }
-        if (nthreads >= 128)
-        {
-            if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
-            __syncthreads();
-        }
-
-        if (threadIdx.x < 32)
-        {
-            volatile float* smem = products;
-            if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
-            if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
-            if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
-            if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
-            if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
-            if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
-        }
+
+        reduce<nthreads>(products, product, tid, plus<float>());

         if (threadIdx.x == 0)
-            confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
-                = (float)(product + free_coef);
+            confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;

     }
@@ -446,36 +417,8 @@ namespace cv { namespace gpu { namespace device
         __shared__ float products[nthreads * nblocks];

         const int tid = threadIdx.z * nthreads + threadIdx.x;
-        products[tid] = product;
-
-        __syncthreads();
-
-        if (nthreads >= 512)
-        {
-            if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
-            __syncthreads();
-        }
-        if (nthreads >= 256)
-        {
-            if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
-            __syncthreads();
-        }
-        if (nthreads >= 128)
-        {
-            if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
-            __syncthreads();
-        }
-
-        if (threadIdx.x < 32)
-        {
-            volatile float* smem = products;
-            if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
-            if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
-            if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
-            if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
-            if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
-            if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
-        }
+
+        reduce<nthreads>(products, product, tid, plus<float>());

         if (threadIdx.x == 0)
             labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
File diff suppressed because it is too large
@@ -43,11 +43,11 @@

 #if !defined CUDA_DISABLER

-#include "internal_shared.hpp"
-
+#include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/vec_traits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
-#include "opencv2/gpu/device/block.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"

 using namespace cv::gpu;
@@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device
 {
     namespace imgproc
     {
+        template <int cn> struct Unroll;
+        template <> struct Unroll<1>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
+            {
+                return thrust::tie(val1, val2);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op);
+            }
+        };
+        template <> struct Unroll<2>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op);
+            }
+        };
+        template <> struct Unroll<3>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y, val2.z);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op, op);
+            }
+        };
+        template <> struct Unroll<4>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op, op, op);
+            }
+        };
+
         __device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
         __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
         __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
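These `Unroll<cn>` helpers exist so a kernel templated on pixel type can reduce one scalar weight plus `cn` channel sums in a single fused call. A hypothetical fragment showing the intended use for a 3-channel accumulator (a sketch with an invented name, blockNormalize, not the commit's kernel; the buffer sizing mirrors the (cn + 1) * CTA_SIZE layout used in the kernel change that follows):

// Fused reduction of a scalar weight sum plus three channel sums;
// cta_buffer must hold (cn + 1) * CTA_SIZE floats.
template <int CTA_SIZE>
__device__ void blockNormalize(float weights_sum, float3 sum, float3& dst)
{
    __shared__ float cta_buffer[CTA_SIZE * (3 + 1)];

    reduce<CTA_SIZE>(Unroll<3>::template smem_tuple<CTA_SIZE>(cta_buffer),
                     Unroll<3>::tie(weights_sum, sum),
                     threadIdx.x,
                     Unroll<3>::op());

    if (threadIdx.x == 0)   // totals are valid in thread 0 after the reduce
        dst = make_float3(sum.x / weights_sum, sum.y / weights_sum, sum.z / weights_sum);
}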
@@ -340,29 +419,14 @@ namespace cv { namespace gpu { namespace device
                 sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
             }

-            volatile __shared__ float cta_buffer[CTA_SIZE];
+            __shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];

-            int tid = threadIdx.x;
+            reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
+                             Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
+                             threadIdx.x,
+                             Unroll<VecTraits<T>::cn>::op());

-            cta_buffer[tid] = weights_sum;
-            __syncthreads();
-            Block::reduce<CTA_SIZE>(cta_buffer, plus());
-            weights_sum = cta_buffer[0];
-
-            __syncthreads();
-
-            for(int n = 0; n < VecTraits<T>::cn; ++n)
-            {
-                cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
-                __syncthreads();
-                Block::reduce<CTA_SIZE>(cta_buffer, plus());
-                reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];
-
-                __syncthreads();
-            }
-
-            if (tid == 0)
+            if (threadIdx.x == 0)
                 dst = saturate_cast<T>(sum / weights_sum);
         }
@@ -47,10 +47,11 @@

 #if !defined CUDA_DISABLER

 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>

 #include "opencv2/gpu/device/common.hpp"
-#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/functional.hpp"

 namespace cv { namespace gpu { namespace device
@@ -75,9 +76,9 @@ namespace cv { namespace gpu { namespace device

     __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
     {
-        __shared__ int smem[8 * 32];
-
-        volatile int* srow = smem + threadIdx.y * blockDim.x;
+        __shared__ int smem0[8 * 32];
+        __shared__ int smem1[8 * 32];
+        __shared__ int smem2[8 * 32];

         const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
@@ -109,9 +110,12 @@ namespace cv { namespace gpu { namespace device
             c += Ix * Iy;
         }

-        reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
-        reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
-        reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
+        int* srow0 = smem0 + threadIdx.y * blockDim.x;
+        int* srow1 = smem1 + threadIdx.y * blockDim.x;
+        int* srow2 = smem2 + threadIdx.y * blockDim.x;
+
+        plus<int> op;
+        reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));

         if (threadIdx.x == 0)
         {
@@ -151,9 +155,13 @@ namespace cv { namespace gpu { namespace device

     __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
     {
-        __shared__ int smem[8 * 32];
+        __shared__ int smem0[8 * 32];
+        __shared__ int smem1[8 * 32];

-        volatile int* srow = smem + threadIdx.y * blockDim.x;
+        int* srow0 = smem0 + threadIdx.y * blockDim.x;
+        int* srow1 = smem1 + threadIdx.y * blockDim.x;
+
+        plus<int> op;

         const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
@@ -167,7 +175,7 @@ namespace cv { namespace gpu { namespace device
             for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
                 m_10 += u * image(loc.y, loc.x + u);

-            reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
+            reduce<32>(srow0, m_10, threadIdx.x, op);

             for (int v = 1; v <= half_k; ++v)
             {
@@ -185,8 +193,7 @@ namespace cv { namespace gpu { namespace device
                     m_sum += u * (val_plus + val_minus);
                 }

-                reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
-                reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
+                reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));

                 m_10 += m_sum;
                 m_01 += v * v_sum;
@@ -52,167 +52,19 @@
 #include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/reduce.hpp"

+using namespace cv::gpu;
+using namespace cv::gpu::device;

-namespace cv { namespace gpu { namespace device
-{
 namespace pyrlk
 {
     __constant__ int c_winSize_x;
     __constant__ int c_winSize_y;

     __constant__ int c_halfWin_x;
     __constant__ int c_halfWin_y;

     __constant__ int c_iters;

-    void loadConstants(int2 winSize, int iters)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
-
-        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
-
-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
-    }
-
-    __device__ void reduce(float& val1, float& val2, float& val3, float* smem1, float* smem2, float* smem3, int tid)
-    {
-        smem1[tid] = val1;
-        smem2[tid] = val2;
-        smem3[tid] = val3;
-        __syncthreads();
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
-        if (tid < 128)
-        {
-            smem1[tid] = val1 += smem1[tid + 128];
-            smem2[tid] = val2 += smem2[tid + 128];
-            smem3[tid] = val3 += smem3[tid + 128];
-        }
-        __syncthreads();
-#endif
-
-        if (tid < 64)
-        {
-            smem1[tid] = val1 += smem1[tid + 64];
-            smem2[tid] = val2 += smem2[tid + 64];
-            smem3[tid] = val3 += smem3[tid + 64];
-        }
-        __syncthreads();
-
-        if (tid < 32)
-        {
-            volatile float* vmem1 = smem1;
-            volatile float* vmem2 = smem2;
-            volatile float* vmem3 = smem3;
-
-            vmem1[tid] = val1 += vmem1[tid + 32];
-            vmem2[tid] = val2 += vmem2[tid + 32];
-            vmem3[tid] = val3 += vmem3[tid + 32];
-
-            vmem1[tid] = val1 += vmem1[tid + 16];
-            vmem2[tid] = val2 += vmem2[tid + 16];
-            vmem3[tid] = val3 += vmem3[tid + 16];
-
-            vmem1[tid] = val1 += vmem1[tid + 8];
-            vmem2[tid] = val2 += vmem2[tid + 8];
-            vmem3[tid] = val3 += vmem3[tid + 8];
-
-            vmem1[tid] = val1 += vmem1[tid + 4];
-            vmem2[tid] = val2 += vmem2[tid + 4];
-            vmem3[tid] = val3 += vmem3[tid + 4];
-
-            vmem1[tid] = val1 += vmem1[tid + 2];
-            vmem2[tid] = val2 += vmem2[tid + 2];
-            vmem3[tid] = val3 += vmem3[tid + 2];
-
-            vmem1[tid] = val1 += vmem1[tid + 1];
-            vmem2[tid] = val2 += vmem2[tid + 1];
-            vmem3[tid] = val3 += vmem3[tid + 1];
-        }
-    }
-
-    __device__ void reduce(float& val1, float& val2, float* smem1, float* smem2, int tid)
-    {
-        smem1[tid] = val1;
-        smem2[tid] = val2;
-        __syncthreads();
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
-        if (tid < 128)
-        {
-            smem1[tid] = val1 += smem1[tid + 128];
-            smem2[tid] = val2 += smem2[tid + 128];
-        }
-        __syncthreads();
-#endif
-
-        if (tid < 64)
-        {
-            smem1[tid] = val1 += smem1[tid + 64];
-            smem2[tid] = val2 += smem2[tid + 64];
-        }
-        __syncthreads();
-
-        if (tid < 32)
-        {
-            volatile float* vmem1 = smem1;
-            volatile float* vmem2 = smem2;
-
-            vmem1[tid] = val1 += vmem1[tid + 32];
-            vmem2[tid] = val2 += vmem2[tid + 32];
-
-            vmem1[tid] = val1 += vmem1[tid + 16];
-            vmem2[tid] = val2 += vmem2[tid + 16];
-
-            vmem1[tid] = val1 += vmem1[tid + 8];
-            vmem2[tid] = val2 += vmem2[tid + 8];
-
-            vmem1[tid] = val1 += vmem1[tid + 4];
-            vmem2[tid] = val2 += vmem2[tid + 4];
-
-            vmem1[tid] = val1 += vmem1[tid + 2];
-            vmem2[tid] = val2 += vmem2[tid + 2];
-
-            vmem1[tid] = val1 += vmem1[tid + 1];
-            vmem2[tid] = val2 += vmem2[tid + 1];
-        }
-    }
-
-    __device__ void reduce(float& val1, float* smem1, int tid)
-    {
-        smem1[tid] = val1;
-        __syncthreads();
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
-        if (tid < 128)
-        {
-            smem1[tid] = val1 += smem1[tid + 128];
-        }
-        __syncthreads();
-#endif
-
-        if (tid < 64)
-        {
-            smem1[tid] = val1 += smem1[tid + 64];
-        }
-        __syncthreads();
-
-        if (tid < 32)
-        {
-            volatile float* vmem1 = smem1;
-
-            vmem1[tid] = val1 += vmem1[tid + 32];
-            vmem1[tid] = val1 += vmem1[tid + 16];
-            vmem1[tid] = val1 += vmem1[tid + 8];
-            vmem1[tid] = val1 += vmem1[tid + 4];
-            vmem1[tid] = val1 += vmem1[tid + 2];
-            vmem1[tid] = val1 += vmem1[tid + 1];
-        }
-    }

     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
@@ -263,7 +115,7 @@ namespace cv { namespace gpu { namespace device

     __device__ __forceinline__ float abs_(float a)
     {
-        return ::fabs(a);
+        return ::fabsf(a);
     }
     __device__ __forceinline__ float4 abs_(const float4& a)
     {
@@ -271,19 +123,19 @@ namespace cv { namespace gpu { namespace device
     }

     template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
-    __global__ void lkSparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
+    __global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
     {
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 110)
-        __shared__ float smem1[128];
-        __shared__ float smem2[128];
-        __shared__ float smem3[128];
+    #if __CUDA_ARCH__ <= 110
+        const int BLOCK_SIZE = 128;
     #else
-        __shared__ float smem1[256];
-        __shared__ float smem2[256];
-        __shared__ float smem3[256];
+        const int BLOCK_SIZE = 256;
     #endif

-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        __shared__ float smem1[BLOCK_SIZE];
+        __shared__ float smem2[BLOCK_SIZE];
+        __shared__ float smem3[BLOCK_SIZE];
+
+        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

         float2 prevPt = prevPts[blockIdx.x];
         prevPt.x *= (1.0f / (1 << level));
@@ -338,7 +190,17 @@ namespace cv { namespace gpu { namespace device
             }
         }

-        reduce(A11, A12, A22, smem1, smem2, smem3, tid);
+        reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
+
+    #if __CUDA_ARCH__ >= 300
+        if (tid == 0)
+        {
+            smem1[0] = A11;
+            smem2[0] = A12;
+            smem3[0] = A22;
+        }
+    #endif

         __syncthreads();

         A11 = smem1[0];
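The `#if __CUDA_ARCH__ >= 300` store is not redundant: with the shuffle path the reduced sums end up only in thread 0's registers, so thread 0 must publish them to shared memory before the whole block reads them back; on older architectures the shared-memory reduce already leaves the totals in `smem[0]`. An annotated restatement of the sequence (comments are mine, the logic is the commit's):

reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22),
                   tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));

#if __CUDA_ARCH__ >= 300
    // Shuffle-based reduce: the totals live only in thread 0's registers,
    // so thread 0 must publish them before the block-wide read below.
    if (tid == 0)
    {
        smem1[0] = A11;
        smem2[0] = A12;
        smem3[0] = A22;
    }
#endif
    // On < sm_30 the shared-memory reduce already left the totals in smem[0].

__syncthreads();

A11 = smem1[0];   // now every thread reads the block-wide sums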
@@ -395,7 +257,16 @@ namespace cv { namespace gpu { namespace device
             }
         }

-        reduce(b1, b2, smem1, smem2, tid);
+        reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
+
+    #if __CUDA_ARCH__ >= 300
+        if (tid == 0)
+        {
+            smem1[0] = b1;
+            smem2[0] = b2;
+        }
+    #endif

         __syncthreads();

         b1 = smem1[0];
@@ -428,7 +299,7 @@ namespace cv { namespace gpu { namespace device
                 }
             }

-            reduce(errval, smem1, tid);
+            reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
         }

         if (tid == 0)
@@ -444,15 +315,15 @@ namespace cv { namespace gpu { namespace device
     }

     template <int cn, int PATCH_X, int PATCH_Y>
-    void lkSparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+    void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
                         int level, dim3 block, cudaStream_t stream)
     {
         dim3 grid(ptcount);

         if (level == 0 && err)
-            lkSparse<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
+            sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
         else
-            lkSparse<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
+            sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);

         cudaSafeCall( cudaGetLastError() );
@@ -460,52 +331,8 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall( cudaDeviceSynchronize() );
     }

-    void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                       int level, dim3 block, dim3 patch, cudaStream_t stream)
-    {
-        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                               int level, dim3 block, cudaStream_t stream);
-
-        static const func_t funcs[5][5] =
-        {
-            {lkSparse_caller<1, 1, 1>, lkSparse_caller<1, 2, 1>, lkSparse_caller<1, 3, 1>, lkSparse_caller<1, 4, 1>, lkSparse_caller<1, 5, 1>},
-            {lkSparse_caller<1, 1, 2>, lkSparse_caller<1, 2, 2>, lkSparse_caller<1, 3, 2>, lkSparse_caller<1, 4, 2>, lkSparse_caller<1, 5, 2>},
-            {lkSparse_caller<1, 1, 3>, lkSparse_caller<1, 2, 3>, lkSparse_caller<1, 3, 3>, lkSparse_caller<1, 4, 3>, lkSparse_caller<1, 5, 3>},
-            {lkSparse_caller<1, 1, 4>, lkSparse_caller<1, 2, 4>, lkSparse_caller<1, 3, 4>, lkSparse_caller<1, 4, 4>, lkSparse_caller<1, 5, 4>},
-            {lkSparse_caller<1, 1, 5>, lkSparse_caller<1, 2, 5>, lkSparse_caller<1, 3, 5>, lkSparse_caller<1, 4, 5>, lkSparse_caller<1, 5, 5>}
-        };
-
-        bindTexture(&tex_If, I);
-        bindTexture(&tex_Jf, J);
-
-        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
-                                        level, block, stream);
-    }
-
-    void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                       int level, dim3 block, dim3 patch, cudaStream_t stream)
-    {
-        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                               int level, dim3 block, cudaStream_t stream);
-
-        static const func_t funcs[5][5] =
-        {
-            {lkSparse_caller<4, 1, 1>, lkSparse_caller<4, 2, 1>, lkSparse_caller<4, 3, 1>, lkSparse_caller<4, 4, 1>, lkSparse_caller<4, 5, 1>},
-            {lkSparse_caller<4, 1, 2>, lkSparse_caller<4, 2, 2>, lkSparse_caller<4, 3, 2>, lkSparse_caller<4, 4, 2>, lkSparse_caller<4, 5, 2>},
-            {lkSparse_caller<4, 1, 3>, lkSparse_caller<4, 2, 3>, lkSparse_caller<4, 3, 3>, lkSparse_caller<4, 4, 3>, lkSparse_caller<4, 5, 3>},
-            {lkSparse_caller<4, 1, 4>, lkSparse_caller<4, 2, 4>, lkSparse_caller<4, 3, 4>, lkSparse_caller<4, 4, 4>, lkSparse_caller<4, 5, 4>},
-            {lkSparse_caller<4, 1, 5>, lkSparse_caller<4, 2, 5>, lkSparse_caller<4, 3, 5>, lkSparse_caller<4, 4, 5>, lkSparse_caller<4, 5, 5>}
-        };
-
-        bindTexture(&tex_If4, I);
-        bindTexture(&tex_Jf4, J);
-
-        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
-                                        level, block, stream);
-    }
-
     template <bool calcErr>
-    __global__ void lkDense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
+    __global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
     {
         extern __shared__ int smem[];
@@ -650,8 +477,63 @@ namespace cv { namespace gpu { namespace device
         }
     }

-    void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
-                     PtrStepSzf err, int2 winSize, cudaStream_t stream)
+    void loadConstants(int2 winSize, int iters)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
+
+        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
+        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
+
+        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
+    }
+
+    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                 int level, dim3 block, dim3 patch, cudaStream_t stream)
+    {
+        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                               int level, dim3 block, cudaStream_t stream);
+
+        static const func_t funcs[5][5] =
+        {
+            {sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
+            {sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
+            {sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
+            {sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
+            {sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
+        };
+
+        bindTexture(&tex_If, I);
+        bindTexture(&tex_Jf, J);
+
+        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
+                                        level, block, stream);
+    }
+
+    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                 int level, dim3 block, dim3 patch, cudaStream_t stream)
+    {
+        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                               int level, dim3 block, cudaStream_t stream);
+
+        static const func_t funcs[5][5] =
+        {
+            {sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
+            {sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
+            {sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
+            {sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
+            {sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
+        };
+
+        bindTexture(&tex_If4, I);
+        bindTexture(&tex_Jf4, J);
+
+        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
+                                        level, block, stream);
+    }
+
+    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
     {
         dim3 block(16, 16);
         dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
@@ -666,12 +548,12 @@ namespace cv { namespace gpu { namespace device

         if (err.data)
         {
-            lkDense<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
+            denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
             cudaSafeCall( cudaGetLastError() );
         }
         else
         {
-            lkDense<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
+            denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
             cudaSafeCall( cudaGetLastError() );
         }
@@ -679,6 +561,5 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
-}}}

 #endif /* CUDA_DISABLER */
@@ -69,7 +69,7 @@ namespace cv { namespace gpu { namespace device

     template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
     {
-        static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
+        static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
         {
             typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device

     template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
     {
-        static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, int)
+        static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
         {
             (void)srcWhole;
             (void)xoff;
@@ -124,10 +124,10 @@ namespace cv { namespace gpu { namespace device
         template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
         { \
             static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
-                PtrStepSz< type > dst, const float* borderValue, int cc) \
+                PtrStepSz< type > dst, const float* borderValue, bool cc20) \
             { \
                 typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                dim3 block(32, cc >= 20 ? 8 : 4); \
+                dim3 block(32, cc20 ? 8 : 4); \
                 dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
                 bindTexture(&tex_remap_ ## type , srcWhole); \
                 tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
@@ -142,7 +142,7 @@ namespace cv { namespace gpu { namespace device
         template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
         { \
             static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
-                PtrStepSz< type > dst, const float*, int) \
+                PtrStepSz< type > dst, const float*, bool) \
             { \
                 dim3 block(32, 8); \
                 dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@@ -194,20 +194,20 @@ namespace cv { namespace gpu { namespace device
     template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
     {
         static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
-            PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
+            PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
         {
             if (stream == 0)
-                RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc);
+                RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
             else
-                RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
+                RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
         }
     };

     template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
-        PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
+        PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
     {
         typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
-            PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
+            PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);

         static const caller_t callers[3][5] =
         {
@@ -235,38 +235,38 @@ namespace cv { namespace gpu { namespace device
         };

         callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
-            static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
+            static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
     }

-    template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

-    //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

-    template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

-    template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

-    //template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    //template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

-    template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
|
||||
} // namespace imgproc
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
|
@ -42,9 +42,11 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
@ -297,28 +299,13 @@ namespace cv { namespace gpu { namespace device
}

extern __shared__ float smem[];
float* dline = smem + winsz * threadIdx.z;

dline[tid] = val;

__syncthreads();

if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }

volatile float* vdline = smem + winsz * threadIdx.z;

if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());

T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;

if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
}
}
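// The two hunks in this file replace the hand-unrolled shared-memory
// reductions with the generic reduce primitive from
// opencv2/gpu/device/reduce.hpp. A minimal sketch of the pattern being
// encapsulated (hypothetical helper, not the actual device::reduce
// implementation; assumes N is a power of two, N >= 64, and warp-synchronous
// execution of the last 32 lanes, which holds on the pre-Volta parts this
// module targets):
//
//     template <int N>
//     __device__ float block_sum(volatile float* smem, float val, int tid)
//     {
//         smem[tid] = val;
//         __syncthreads();
//
//         for (int s = N / 2; s > 32; s >>= 1)   // tree reduction down to one warp
//         {
//             if (tid < s)
//                 smem[tid] += smem[tid + s];
//             __syncthreads();
//         }
//
//         if (tid < 32)                          // warp-synchronous tail, no barrier needed
//         {
//             for (int s = 32; s > 0; s >>= 1)
//                 smem[tid] += smem[tid + s];
//         }
//
//         __syncthreads();
//         return smem[0];
//     }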
@ -496,26 +483,11 @@ namespace cv { namespace gpu { namespace device
}

extern __shared__ float smem[];
float* dline = smem + winsz * threadIdx.z;

dline[tid] = val;

__syncthreads();

if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }

volatile float* vdline = smem + winsz * threadIdx.z;

if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());

if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
}
}
@ -47,13 +47,13 @@

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/filters.hpp"
#include <float.h>

namespace cv { namespace gpu { namespace device
{
@ -568,7 +568,9 @@ namespace cv { namespace gpu { namespace device

float bestx = 0, besty = 0, best_mod = 0;

#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
@ -599,8 +601,9 @@ namespace cv { namespace gpu { namespace device
sumy += s_Y[threadIdx.x + 96];
}

device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
plus<float> op;
device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));

const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
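// The rewritten call above shows the tuple form of the new reduce primitive:
// smem_tuple() bundles several shared-memory rows and thrust::tie() bundles
// the per-thread partials, so both sums are reduced in a single pass instead
// of two. A hedged usage sketch with assumed buffer shapes (4 rows of 32
// lanes, matching the 32x4 block this orientation kernel appears to use):
//
//     __shared__ float s_sumx[32 * 4];
//     __shared__ float s_sumy[32 * 4];
//
//     float sumx = ..., sumy = ...;   // per-thread partial sums
//
//     plus<float> op;
//     device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
//                        thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
//
//     // afterwards lane 0 of each row holds both reduced values in (sumx, sumy)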
@ -638,7 +641,7 @@ namespace cv { namespace gpu { namespace device
kp_dir *= 180.0f / CV_PI_F;

kp_dir = 360.0f - kp_dir;
if (::fabsf(kp_dir - 360.f) < FLT_EPSILON)
if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
kp_dir = 0.f;

featureDir[blockIdx.x] = kp_dir;
@ -697,11 +700,6 @@ namespace cv { namespace gpu { namespace device
{
typedef uchar elem_type;

__device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
{
}

__device__ __forceinline__ uchar operator ()(int i, int j) const
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
@ -715,285 +713,215 @@ namespace cv { namespace gpu { namespace device
float win_offset;
float cos_dir;
float sin_dir;
int width;
int height;
};

__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
float& dx, float& dy)
{
__shared__ float s_PATCH[6][6];
__shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];

const float centerX = featureX[blockIdx.x];
const float centerY = featureY[blockIdx.x];
const float size = featureSize[blockIdx.x];
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
if (std::abs(descriptor_dir - 360.f) < FLT_EPSILON)
descriptor_dir = 0.f;
descriptor_dir *= (float)(CV_PI_F / 180.0f);
dx = dy = 0.0f;

/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = size * 1.2f / 9.0f;
WinReader win;

/* Extract a window of pixels around the keypoint of size 20s */
win.centerX = featureX[blockIdx.x];
win.centerY = featureY[blockIdx.x];

// The sampling intervals and wavelet sizes for selecting an orientation
// and building the keypoint descriptor are defined relative to 's'
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;

// Extract a window of pixels around the keypoint of size 20s
const int win_size = (int)((PATCH_SZ + 1) * s);

float sin_dir;
float cos_dir;
sincosf(descriptor_dir, &sin_dir, &cos_dir);
win.width = win.height = win_size;

/* Nearest neighbour version (faster) */
const float win_offset = -(float)(win_size - 1) / 2;

// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
const int xIndex = xBlock * 5 + threadIdx.x;
const int yIndex = yBlock * 5 + threadIdx.y;

const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;

LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));

s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);

__syncthreads();

if (threadIdx.x < 5 && threadIdx.y < 5)
{
const int tid = threadIdx.y * 5 + threadIdx.x;

const float dw = c_DW[yIndex * PATCH_SZ + xIndex];

const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;
const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;

s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)
{
// first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads
{
sdata1[tid] += sdata1[tid + 16];
sdata2[tid] += sdata2[tid + 16];
sdata3[tid] += sdata3[tid + 16];
sdata4[tid] += sdata4[tid + 16];
}

// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (tid < 8)
{
sdata1[tid] += sdata1[tid + 8];
sdata1[tid] += sdata1[tid + 4];
sdata1[tid] += sdata1[tid + 2];
sdata1[tid] += sdata1[tid + 1];

sdata2[tid] += sdata2[tid + 8];
sdata2[tid] += sdata2[tid + 4];
sdata2[tid] += sdata2[tid + 2];
sdata2[tid] += sdata2[tid + 1];

sdata3[tid] += sdata3[tid + 8];
sdata3[tid] += sdata3[tid + 4];
sdata3[tid] += sdata3[tid + 2];
sdata3[tid] += sdata3[tid + 1];

sdata4[tid] += sdata4[tid + 8];
sdata4[tid] += sdata4[tid + 4];
sdata4[tid] += sdata4[tid + 2];
sdata4[tid] += sdata4[tid + 1];
}
}
__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float sdxabs[25];
__shared__ float sdyabs[25];

calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
__syncthreads();
// Nearest neighbour version (faster)
win.win_offset = -(win_size - 1.0f) / 2.0f;

float descriptor_dir = 360.0f - featureDir[blockIdx.x];
if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
descriptor_dir = 0.f;
descriptor_dir *= CV_PI_F / 180.0f;
sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);

const int tid = threadIdx.y * blockDim.x + threadIdx.x;

if (tid < 25)
const int xLoadInd = tid % (PATCH_SZ + 1);
const int yLoadInd = tid / (PATCH_SZ + 1);

if (yLoadInd < (PATCH_SZ + 1))
{
sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array
sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array
if (s > 1)
{
AreaFilter<WinReader> filter(win, s, s);
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
}
else
{
LinearFilter<WinReader> filter(win);
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
}
}

__syncthreads();

reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
__syncthreads();
const int xPatchInd = threadIdx.x % 5;
const int yPatchInd = threadIdx.x / 5;

float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
if (yPatchInd < 5)
{
const int xBlockInd = threadIdx.y % 4;
const int yBlockInd = threadIdx.y / 4;

const int xInd = xBlockInd * 5 + xPatchInd;
const int yInd = yBlockInd * 5 + yPatchInd;

const float dw = c_DW[yInd * PATCH_SZ + xInd];

dx = (s_PATCH[yInd ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd ]) * dw;
dy = (s_PATCH[yInd + 1][xInd ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd ][xInd + 1]) * dw;
}
}
__global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];

float* sRow = smem + threadIdx.y * 32;

float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);

float dxabs = ::fabsf(dx);
float dyabs = ::fabsf(dy);

plus<float> op;

reduce<32>(sRow, dx, threadIdx.x, op);
reduce<32>(sRow, dy, threadIdx.x, op);
reduce<32>(sRow, dxabs, threadIdx.x, op);
reduce<32>(sRow, dyabs, threadIdx.x, op);

float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;

// write dx, dy, |dx|, |dy|
if (tid == 0)
{
descriptors_block[0] = sdx[0];
descriptors_block[1] = sdy[0];
descriptors_block[2] = sdxabs[0];
descriptors_block[3] = sdyabs[0];
}
}
if (threadIdx.x == 0)
*descriptors_block = make_float4(dx, dy, dxabs, dyabs);
}
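// In the rewritten kernels each block is 32x16 threads, one 32-lane row per
// 5x5 sub-region, replacing the old dim3(6, 6) blocks spread over a 2-D grid.
// Only 25 lanes of a row carry samples; the remaining lanes keep dx = dy = 0,
// so the full-warp reduce<32> still produces the correct sum. A sketch of the
// indexing assumed by calc_dx_dy above (illustration only, names hypothetical):
//
//     const int subRegion = threadIdx.y;    // 0..15 -> one cell of the 4x4 grid
//     const int sample    = threadIdx.x;    // 0..31, only 0..24 are used
//     const int xInd = (subRegion % 4) * 5 + sample % 5;   // column in the 20x20 patch
//     const int yInd = (subRegion / 4) * 5 + sample / 5;   // row in the 20x20 patch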
__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
__global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float smem[32 * 16];

// sum (reduce) 5x5 area response
__shared__ float sd1[25];
__shared__ float sd2[25];
__shared__ float sdabs1[25];
__shared__ float sdabs2[25];
float* sRow = smem + threadIdx.y * 32;

calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
__syncthreads();
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);

const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;

if (tid < 25)
plus<float> op;

float d1 = 0.0f;
float d2 = 0.0f;
float abs1 = 0.0f;
float abs2 = 0.0f;

if (dy >= 0)
{
if (sdy[tid] >= 0)
{
sd1[tid] = sdx[tid];
sdabs1[tid] = ::fabs(sdx[tid]);
sd2[tid] = 0;
sdabs2[tid] = 0;
d1 = dx;
abs1 = ::fabsf(dx);
}
else
{
sd1[tid] = 0;
sdabs1[tid] = 0;
sd2[tid] = sdx[tid];
sdabs2[tid] = ::fabs(sdx[tid]);
d2 = dx;
abs2 = ::fabsf(dx);
}
__syncthreads();

reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();

float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
reduce<32>(sRow, d1, threadIdx.x, op);
reduce<32>(sRow, d2, threadIdx.x, op);
reduce<32>(sRow, abs1, threadIdx.x, op);
reduce<32>(sRow, abs2, threadIdx.x, op);

// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0)
{
descriptors_block[0] = sd1[0];
descriptors_block[1] = sdabs1[0];
descriptors_block[2] = sd2[0];
descriptors_block[3] = sdabs2[0];
}
__syncthreads();
if (threadIdx.x == 0)
descriptors_block[0] = make_float4(d1, abs1, d2, abs2);

if (sdx[tid] >= 0)
if (dx >= 0)
{
sd1[tid] = sdy[tid];
sdabs1[tid] = ::fabs(sdy[tid]);
sd2[tid] = 0;
sdabs2[tid] = 0;
d1 = dy;
abs1 = ::fabsf(dy);
d2 = 0.0f;
abs2 = 0.0f;
}
else
{
sd1[tid] = 0;
sdabs1[tid] = 0;
sd2[tid] = sdy[tid];
sdabs2[tid] = ::fabs(sdy[tid]);
d1 = 0.0f;
abs1 = 0.0f;
d2 = dy;
abs2 = ::fabsf(dy);
}
__syncthreads();

reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
reduce<32>(sRow, d1, threadIdx.x, op);
reduce<32>(sRow, d2, threadIdx.x, op);
reduce<32>(sRow, abs1, threadIdx.x, op);
reduce<32>(sRow, abs2, threadIdx.x, op);

// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0)
{
descriptors_block[4] = sd1[0];
descriptors_block[5] = sdabs1[0];
descriptors_block[6] = sd2[0];
descriptors_block[7] = sdabs2[0];
}
}
if (threadIdx.x == 0)
descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
}
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{
__shared__ float smem[BLOCK_DIM_X];
__shared__ float s_len;

// no need for thread ID
float* descriptor_base = descriptors.ptr(blockIdx.x);

// read in the unnormalized descriptor values (squared)
__shared__ float sqDesc[BLOCK_DIM_X];
const float lookup = descriptor_base[threadIdx.x];
sqDesc[threadIdx.x] = lookup * lookup;
__syncthreads();
const float val = descriptor_base[threadIdx.x];

if (BLOCK_DIM_X >= 128)
{
if (threadIdx.x < 64)
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
__syncthreads();
}
float len = val * val;
reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());

// reduction to get total
if (threadIdx.x < 32)
{
volatile float* smem = sqDesc;

smem[threadIdx.x] += smem[threadIdx.x + 32];
smem[threadIdx.x] += smem[threadIdx.x + 16];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
}

// compute length (square root)
__shared__ float len;
if (threadIdx.x == 0)
{
len = sqrtf(sqDesc[0]);
}
s_len = ::sqrtf(len);

__syncthreads();

// normalize and store in output
descriptor_base[threadIdx.x] = lookup / len;
descriptor_base[threadIdx.x] = val / s_len;
}
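// normalize_descriptors now computes the block-wide sum of squares with the
// shared reduce primitive and divides every element by its square root. The
// equivalent scalar computation, for reference (a sketch, not part of the
// patch):
//
//     #include <cmath>
//     #include <vector>
//
//     void normalizeDescriptor(std::vector<float>& d)
//     {
//         float len = 0.f;
//         for (size_t i = 0; i < d.size(); ++i)
//             len += d[i] * d[i];            // sum of squares (the reduce step)
//         len = std::sqrt(len);
//         for (size_t i = 0; i < d.size(); ++i)
//             d[i] /= len;                   // scale to unit L2 norm
//     }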
void compute_descriptors_gpu(const PtrStepSzf& descriptors,
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D

if (descriptors.cols == 64)
{
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );

normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );

normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
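// The launcher above switches from a 2-D grid of dim3(nFeatures, 16) blocks
// of dim3(6, 6) threads to a 1-D grid of nFeatures blocks of dim3(32, 16)
// threads, so one block now produces a whole descriptor and one warp row
// produces one 4-float (or 8-float) bin:
//
//     // before: 16 blocks of 36 threads per feature
//     compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(...);
//     // after: 1 block of 512 threads per feature
//     compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(...);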
@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device

template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
@ -158,7 +158,7 @@ namespace cv { namespace gpu { namespace device

template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, int)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)xoff;
(void)yoff;
@ -195,10 +195,10 @@ namespace cv { namespace gpu { namespace device
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, int cc) \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
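// Callers used to pass the raw compute capability as an int (e.g. 11, 13, 20)
// and kernels compared it against 20; they now pass a single boolean,
// typically deviceSupports(FEATURE_SET_COMPUTE_20), and pick the launch shape
// from it:
//
//     dim3 block(32, cc20 ? 8 : 4);   // 256 threads on Fermi and newer, 128 otherwise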
@ -212,7 +212,7 @@ namespace cv { namespace gpu { namespace device
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, int) \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@ -263,20 +263,20 @@ namespace cv { namespace gpu { namespace device

template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
else
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc);
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
}
};

template <class Transform, typename T>
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);

static const func_t funcs[3][5] =
{
@ -304,84 +304,84 @@ namespace cv { namespace gpu { namespace device
};

funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}

template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );

warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
}

template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );

warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
}

template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
@ -336,7 +336,7 @@ namespace
{
void calcDiffHistogram(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
{
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
static const func_t funcs[4][4] =
{
{0,0,0,0},
@ -348,14 +348,11 @@
hist.create(3, 256, CV_32SC1);
histBuf.create(3, bgfg::PARTIAL_HISTOGRAM_COUNT * bgfg::HISTOGRAM_BIN_COUNT, CV_32SC1);

cv::gpu::DeviceInfo devInfo;
int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();

funcs[prevFrame.channels() - 1][curFrame.channels() - 1](
prevFrame, curFrame,
hist.ptr<unsigned int>(0), hist.ptr<unsigned int>(1), hist.ptr<unsigned int>(2),
histBuf.ptr<unsigned int>(0), histBuf.ptr<unsigned int>(1), histBuf.ptr<unsigned int>(2),
cc, 0);
cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20), 0);
}
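// deviceSupports, added by this commit, replaces the DeviceInfo construction
// and manual version arithmetic deleted above with a cached feature-set
// check. A hedged usage sketch:
//
//     #include "opencv2/gpu/gpu.hpp"
//
//     bool useComputeCapability20Path()
//     {
//         // one call instead of devInfo.majorVersion() * 10 + devInfo.minorVersion()
//         return cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20);
//     }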
void calcRelativeVariance(unsigned int hist[3 * 256], double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT])
@ -526,15 +523,15 @@

size_t total = all_contours.size();

_contours.create(total, 1, 0, -1, true);
_contours.create((int) total, 1, 0, -1, true);

cv::SeqIterator<CvSeq*> it = all_contours.begin();
for (size_t i = 0; i < total; ++i, ++it)
{
CvSeq* c = *it;
((CvContour*)c)->color = (int)i;
_contours.create((int)c->total, 1, CV_32SC2, i, true);
cv::Mat ci = _contours.getMat(i);
_contours.create((int)c->total, 1, CV_32SC2, (int)i, true);
cv::Mat ci = _contours.getMat((int)i);
CV_Assert( ci.isContinuous() );
cvCvtSeqToArray(c, ci.data);
}
@ -294,9 +294,8 @@ void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf&

ensureSizeIsEnough(1, maxCircles, CV_32FC3, circles);

DeviceInfo devInfo;
const int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, circles.ptr<float3>(), maxCircles,
dp, minRadius, maxRadius, votesThreshold, devInfo.supports(FEATURE_SET_COMPUTE_20));
dp, minRadius, maxRadius, votesThreshold, deviceSupports(FEATURE_SET_COMPUTE_20));

if (circlesCount > 0)
circles.cols = circlesCount;
@ -531,7 +530,7 @@
const func_t func = funcs[dx.depth()];
CV_Assert(func != 0);

edgePointList.cols = edgePointList.step / sizeof(int);
edgePointList.cols = (int) (edgePointList.step / sizeof(int));
ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList);

edgePointList.cols = func(edges, dx, dy, edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1));
@ -547,14 +547,13 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S

    cudaStream_t stream = StreamAccessor::getStream(s);

    DeviceInfo info;
    cv::Size whole;
    cv::Point offset;

    src.locateROI(whole, offset);

    if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (src.step - offset.x))
    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
    {
        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
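Restating the fast-path gate above as a checklist; the rationale notes are informed guesses, only the tests themselves come from the code:

    // deviceSupports(WARP_SHUFFLE_FUNCTIONS)          -> compute capability 3.0+
    // src.cols <= 2048                                -> kernel-side width limit
    // offset.x % 16 == 0                              -> aligned ROI start
    // ((src.cols + 63) / 64) * 64 <= step - offset.x  -> padded width fits the row pitch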
@ -972,36 +971,26 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
}

namespace cv { namespace gpu { namespace device
{
    namespace hist
    {
        void histogram256_gpu(PtrStepSzb src, int* hist, unsigned int* buf, cudaStream_t stream);

        const int PARTIAL_HISTOGRAM256_COUNT = 240;
        const int HISTOGRAM256_BIN_COUNT = 256;

        void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
        void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
        void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
    }
}}}

void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
{
    GpuMat buf;
    calcHist(src, hist, buf, stream);
    CV_Assert(src.type() == CV_8UC1);

    hist.create(1, 256, CV_32SC1);
    hist.setTo(Scalar::all(0));

    hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
}

void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)
{
    using namespace ::cv::gpu::device::hist;

    CV_Assert(src.type() == CV_8UC1);

    hist.create(1, 256, CV_32SC1);

    ensureSizeIsEnough(1, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT, CV_32SC1, buf);

    histogram256_gpu(src, hist.ptr<int>(), buf.ptr<unsigned int>(), StreamAccessor::getStream(stream));
    (void) buf;
    calcHist(src, hist, stream);
}

void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream)
@ -1019,8 +1008,6 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream&

void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
{
    using namespace ::cv::gpu::device::hist;

    CV_Assert(src.type() == CV_8UC1);

    dst.create(src.size(), src.type());
@ -1028,15 +1015,12 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
    int intBufSize;
    nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );

    int bufSize = static_cast<int>(std::max(256 * 240 * sizeof(int), intBufSize + 256 * sizeof(int)));
    ensureSizeIsEnough(1, intBufSize + 256 * sizeof(int), CV_8UC1, buf);

    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);

    GpuMat histBuf(1, 256 * 240, CV_32SC1, buf.ptr());
    GpuMat intBuf(1, intBufSize, CV_8UC1, buf.ptr());
    GpuMat lut(1, 256, CV_32S, buf.ptr() + intBufSize);

    calcHist(src, hist, histBuf, s);
    calcHist(src, hist, s);

    cudaStream_t stream = StreamAccessor::getStream(s);

@ -1044,10 +1028,7 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&

    nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );

    equalizeHist_gpu(src, dst, lut.ptr<int>(), stream);
    hist::equalizeHist(src, dst, lut.ptr<int>(), stream);
}
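The reworked equalizeHist carves the NPP scratch area and the 256-entry LUT out of one byte buffer; the layout, restated from the lines above (intBufSize comes from nppsIntegralGetBufferSize_32s):

    // One CV_8UC1 allocation, two views:
    //   bytes [0, intBufSize)                 -> NPP integral scratch (intBuf)
    //   bytes [intBufSize, intBufSize + 1024) -> 256 x CV_32S LUT (lut)
    ensureSizeIsEnough(1, intBufSize + 256 * sizeof(int), CV_8UC1, buf);
    GpuMat intBuf(1, intBufSize, CV_8UC1, buf.ptr());
    GpuMat lut(1, 256, CV_32S, buf.ptr() + intBufSize);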
////////////////////////////////////////////////////////////////////////
@ -1448,35 +1429,31 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
//////////////////////////////////////////////////////////////////////////////
// Canny

cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_) : dx(dx_), dy(dy_)
cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_)
{
    CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());

    create(dx_.size(), -1);
    (void) dx_;
    (void) dy_;
}

void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
{
    if (apperture_size > 0)
    {
        ensureSizeIsEnough(image_size, CV_32SC1, dx);
        ensureSizeIsEnough(image_size, CV_32SC1, dy);

        if (apperture_size == 3)
        if (apperture_size != 3)
        {
            ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
            ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
        }
        else if(apperture_size > 0)
        {
            if (!filterDX)
                filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
            if (!filterDY)
                filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
        }
    }

    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);
    ensureSizeIsEnough(image_size, CV_32FC1, edgeBuf);
    ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);

    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1);
    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2);
}

void cv::gpu::CannyBuf::release()
@ -1490,91 +1467,86 @@ void cv::gpu::CannyBuf::release()
    trackBuf2.release();
}

namespace cv { namespace gpu { namespace device
{
    namespace canny
    {
        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
        void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
        void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);

        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
        void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);

        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
        void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);

        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
        void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);

        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);

        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
        void getEdges(PtrStepSzi map, PtrStepSzb dst);
    }
}}}

namespace
{
    void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
    void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
    {
        using namespace ::cv::gpu::device::canny;
        using namespace canny;

        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
        calcMap(dx, dy, buf.edgeBuf, buf.dx_buf, low_thresh, high_thresh);

        edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);
        edgesHysteresisLocal(buf.dx_buf, buf.trackBuf1.ptr<ushort2>());

        edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);
        edgesHysteresisGlobal(buf.dx_buf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>());

        getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
        getEdges(buf.dx_buf, dst);
    }
}
void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
    CannyBuf buf(src.size(), apperture_size);
    CannyBuf buf;
    Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
}

void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
    using namespace ::cv::gpu::device::canny;
    using namespace canny;

    CV_Assert(src.type() == CV_8UC1);

    if (!TargetArchs::builtWith(SHARED_ATOMICS) || !DeviceInfo().supports(SHARED_ATOMICS))
    if (!deviceSupports(SHARED_ATOMICS))
        CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics");

    if( low_thresh > high_thresh )
        std::swap( low_thresh, high_thresh);

    dst.create(src.size(), CV_8U);
    dst.setTo(Scalar::all(0));

    buf.create(src.size(), apperture_size);
    buf.edgeBuf.setTo(Scalar::all(0));

    if (apperture_size == 3)
    {
        calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
        Size wholeSize;
        Point ofs;
        src.locateROI(wholeSize, ofs);
        GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);

        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
        calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.edgeBuf, L2gradient);
    }
    else
    {
        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));

        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
        calcMagnitude(buf.dx, buf.dy, buf.edgeBuf, L2gradient);
    }

    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
    CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}

void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
    CannyBuf buf(dx, dy);
    CannyBuf buf;
    Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
}

void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
    using namespace ::cv::gpu::device::canny;
    using namespace canny;

    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
    CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
@ -1583,17 +1555,11 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& d
        std::swap( low_thresh, high_thresh);

    dst.create(dx.size(), CV_8U);
    dst.setTo(Scalar::all(0));

    buf.dx = dx; buf.dy = dy;
    buf.create(dx.size(), -1);
    buf.edgeBuf.setTo(Scalar::all(0));

    calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
    calcMagnitude(dx, dy, buf.edgeBuf, L2gradient);

    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
    CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}

#endif /* !defined (HAVE_CUDA) */

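For callers the buffered idiom is unchanged by this rework; a minimal usage sketch (the input image is hypothetical):

    cv::gpu::GpuMat img;    // hypothetical CV_8UC1 input
    cv::gpu::GpuMat edges;
    cv::gpu::CannyBuf buf;  // reuse across calls to avoid reallocation
    cv::gpu::Canny(img, buf, edges, 50.0, 100.0, 3, false);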
@ -118,7 +118,7 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat
{
    CV_Assert(src.type() == CV_8UC1);

    if (!TargetArchs::builtWith(FEATURE_SET_COMPUTE_13) || !DeviceInfo().supports(FEATURE_SET_COMPUTE_13))
    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
        CV_Error(CV_StsNotImplemented, "Not sufficient compute capability");

    NppiSize sz;
@ -204,34 +204,19 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
////////////////////////////////////////////////////////////////////////
// Sum

namespace cv { namespace gpu { namespace device
{
    namespace matrix_reductions
    {
        namespace sum
        {
            template <typename T>
            void sumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);

            template <typename T>
            void sumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
    template <typename T, int cn>
    void run(PtrStepSzb src, void* buf, double* sum);

            template <typename T>
            void absSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
    template <typename T, int cn>
    void runAbs(PtrStepSzb src, void* buf, double* sum);

            template <typename T>
            void absSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            template <typename T>
            void sqrSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            template <typename T>
            void sqrSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);

            void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
    template <typename T, int cn>
    void runSqr(PtrStepSzb src, void* buf, double* sum);
        }
    }
}}}

Scalar cv::gpu::sum(const GpuMat& src)
{
@ -239,43 +224,38 @@ Scalar cv::gpu::sum(const GpuMat& src)
    return sum(src, buf);
}

Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
    static const func_t funcs[7][5] =
    {
        sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
        sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
        sumMultipassCaller<int>, sumMultipassCaller<float>
        {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
        {0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
        {0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
        {0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
        {0, ::sum::run<int , 1>, ::sum::run<int , 2>, ::sum::run<int , 3>, ::sum::run<int , 4>},
        {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
        {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
    };

    static Caller singlepass_callers[] = {
        sumCaller<unsigned char>, sumCaller<char>,
        sumCaller<unsigned short>, sumCaller<short>,
        sumCaller<int>, sumCaller<float>
    };

    CV_Assert(src.depth() <= CV_32F);

    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    Caller* callers = multipass_callers;
    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
        callers = singlepass_callers;

    Caller caller = callers[src.depth()];

    double result[4];
    caller(src, buf, result, src.channels());
    return Scalar(result[0], result[1], result[2], result[3]);
    if (src.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    buf.setTo(Scalar::all(0));

    const func_t func = funcs[src.depth()][src.channels()];

    double result[4];
    func(src, buf.data, result);

    return Scalar(result[0], result[1], result[2], result[3]);
}

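The same depth-by-channels table drives absSum and sqrSum below. A sketch of the dispatch, with one inference flagged: the zero column exists so the 1-based channels() can index directly, and clearing the buffer first suggests the single-pass kernels accumulate into it:

    // row = src.depth() (CV_8U..CV_64F), column = src.channels() (1..4)
    const func_t func = funcs[src.depth()][src.channels()];
    double result[4] = {0, 0, 0, 0};
    func(src, buf.data, result);  // buf was zeroed before the call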
Scalar cv::gpu::absSum(const GpuMat& src)
{
@ -283,44 +263,38 @@ Scalar cv::gpu::absSum(const GpuMat& src)
    return absSum(src, buf);
}

Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{
    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
    static const func_t funcs[7][5] =
    {
        absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
        absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
        absSumMultipassCaller<int>, absSumMultipassCaller<float>
        {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
        {0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
        {0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
        {0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
        {0, ::sum::runAbs<int , 1>, ::sum::runAbs<int , 2>, ::sum::runAbs<int , 3>, ::sum::runAbs<int , 4>},
        {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
        {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
    };

    static Caller singlepass_callers[] =
    if (src.depth() == CV_64F)
    {
        absSumCaller<unsigned char>, absSumCaller<char>,
        absSumCaller<unsigned short>, absSumCaller<short>,
        absSumCaller<int>, absSumCaller<float>
    };

    CV_Assert(src.depth() <= CV_32F);

    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    Caller* callers = multipass_callers;
    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
        callers = singlepass_callers;

    Caller caller = callers[src.depth()];

    double result[4];
    caller(src, buf, result, src.channels());
    return Scalar(result[0], result[1], result[2], result[3]);
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    buf.setTo(Scalar::all(0));

    const func_t func = funcs[src.depth()][src.channels()];

    double result[4];
    func(src, buf.data, result);

    return Scalar(result[0], result[1], result[2], result[3]);
}

Scalar cv::gpu::sqrSum(const GpuMat& src)
{
@ -328,70 +302,49 @@ Scalar cv::gpu::sqrSum(const GpuMat& src)
    return sqrSum(src, buf);
}

Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);

    static Caller multipass_callers[] =
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
    static const func_t funcs[7][5] =
    {
        sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
        sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
        sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>
        {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
        {0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
        {0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
        {0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
        {0, ::sum::runSqr<int , 1>, ::sum::runSqr<int , 2>, ::sum::runSqr<int , 3>, ::sum::runSqr<int , 4>},
        {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
        {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
    };

    static Caller singlepass_callers[7] =
    if (src.depth() == CV_64F)
    {
        sqrSumCaller<unsigned char>, sqrSumCaller<char>,
        sqrSumCaller<unsigned short>, sqrSumCaller<short>,
        sqrSumCaller<int>, sqrSumCaller<float>
    };

    CV_Assert(src.depth() <= CV_32F);

    Caller* callers = multipass_callers;
    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
        callers = singlepass_callers;
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    buf.setTo(Scalar::all(0));

    Caller caller = callers[src.depth()];
    const func_t func = funcs[src.depth()][src.channels()];

    double result[4];
    caller(src, buf, result, src.channels());
    func(src, buf.data, result);

    return Scalar(result[0], result[1], result[2], result[3]);
}

////////////////////////////////////////////////////////////////////////
// Find min or max
// minMax

namespace cv { namespace gpu { namespace device
namespace minMax
{
    namespace matrix_reductions
    {
        namespace minmax
        {
            void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
    void getBufSize(int cols, int rows, int& bufcols, int& bufrows);

            template <typename T>
            void minMaxCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);

            template <typename T>
            void minMaxMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);

            template <typename T>
            void minMaxMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);

            template <typename T>
            void minMaxMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
        }
    }
}}}


void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
@ -399,112 +352,49 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    minMax(src, minVal, maxVal, mask, buf);
}

void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
    using namespace ::cv::gpu::device::matrix_reductions::minmax;

    typedef void (*Caller)(const PtrStepSzb, double*, double*, PtrStepb);
    typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);

    static Caller multipass_callers[] =
    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    static const func_t funcs[] =
    {
        minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
        minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
        minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
        ::minMax::run<uchar>,
        ::minMax::run<schar>,
        ::minMax::run<ushort>,
        ::minMax::run<short>,
        ::minMax::run<int>,
        ::minMax::run<float>,
        ::minMax::run<double>
    };

    static Caller singlepass_callers[] =
    {
        minMaxCaller<unsigned char>, minMaxCaller<char>,
        minMaxCaller<unsigned short>, minMaxCaller<short>,
        minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
    };

    static MaskedCaller masked_multipass_callers[] =
    {
        minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
        minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
        minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
    };

    static MaskedCaller masked_singlepass_callers[] =
    {
        minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
        minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
        minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
    };

    CV_Assert(src.depth() <= CV_64F);
    CV_Assert( src.channels() == 1 );
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );

    if (src.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;

    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height);
    ::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    if (mask.empty())
    {
        Caller* callers = multipass_callers;
        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
            callers = singlepass_callers;
        const func_t func = funcs[src.depth()];

        Caller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, minVal, maxVal, buf);
        double temp1, temp2;
        func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
    }
    else
    {
        MaskedCaller* callers = masked_multipass_callers;
        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
            callers = masked_singlepass_callers;

        MaskedCaller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, mask, minVal, maxVal, buf);
    }
}

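A usage sketch of the reworked wrapper; passing 0 for a bound you do not need is fine because the function routes it to a local temporary (src here is a hypothetical single-channel GpuMat):

    double maxVal = 0.0;
    cv::gpu::minMax(src, 0, &maxVal);  // the minimum is computed but discarded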
////////////////////////////////////////////////////////////////////////
// Locate min and max
// minMaxLoc

namespace cv { namespace gpu { namespace device
namespace minMaxLoc
{
    namespace matrix_reductions
    {
        namespace minmaxloc
        {
            void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
                                    int& b1rows, int& b2cols, int& b2rows);
    void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);

            template <typename T>
            void minMaxLocCaller(const PtrStepSzb src, double* minval, double* maxval,
                                 int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

            template <typename T>
            void minMaxLocMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
                                     int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

            template <typename T>
            void minMaxLocMultipassCaller(const PtrStepSzb src, double* minval, double* maxval,
                                          int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

            template <typename T>
            void minMaxLocMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
                                              int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
        }
    }
}}}

void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
@ -515,104 +405,49 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
                        const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
{
    using namespace ::cv::gpu::device::matrix_reductions::minmaxloc;

    typedef void (*Caller)(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
    typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);

    static Caller multipass_callers[] =
    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    static const func_t funcs[] =
    {
        minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
        minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
        minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
        ::minMaxLoc::run<uchar>,
        ::minMaxLoc::run<schar>,
        ::minMaxLoc::run<ushort>,
        ::minMaxLoc::run<short>,
        ::minMaxLoc::run<int>,
        ::minMaxLoc::run<float>,
        ::minMaxLoc::run<double>
    };

    static Caller singlepass_callers[] =
    {
        minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
        minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
        minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
    };

    static MaskedCaller masked_multipass_callers[] =
    {
        minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
        minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
        minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
    };

    static MaskedCaller masked_singlepass_callers[] =
    {
        minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
        minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
        minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
    };

    CV_Assert(src.depth() <= CV_64F);
    CV_Assert( src.channels() == 1 );
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );

    if (src.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    int minLoc_[2];
    int maxLoc_[2];

    Size valbuf_size, locbuf_size;
    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width,
                       valbuf_size.height, locbuf_size.width, locbuf_size.height);
    ::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
    ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
    ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);

    if (mask.empty())
    {
        Caller* callers = multipass_callers;
        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
            callers = singlepass_callers;
        const func_t func = funcs[src.depth()];

        Caller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }
    else
    {
        MaskedCaller* callers = masked_multipass_callers;
        if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
            callers = masked_singlepass_callers;

        MaskedCaller caller = callers[src.type()];
        CV_Assert(caller != 0);
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }

    if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
    if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
    double temp1, temp2;
    Point temp3, temp4;
    func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
}

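One detail worth noting: the kernel receives the location as int* and writes two consecutive ints, and the wrapper hands it &minLoc->x. That works because cv::Point (Point2i) lays out x and y as adjacent int members:

    cv::Point p;
    int* loc = &p.x;  // points at two adjacent ints
    loc[0] = 10;      // p.x
    loc[1] = 20;      // p.y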
//////////////////////////////////////////////////////////////////////////////
// Count non-zero elements
// countNonZero

namespace cv { namespace gpu { namespace device
namespace countNonZero
{
    namespace matrix_reductions
    {
        namespace countnonzero
        {
            void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
    void getBufSize(int cols, int rows, int& bufcols, int& bufrows);

            template <typename T>
            int countNonZeroCaller(const PtrStepSzb src, PtrStepb buf);

            template <typename T>
            int countNonZeroMultipassCaller(const PtrStepSzb src, PtrStepb buf);
    int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
        }
    }
}}}

int cv::gpu::countNonZero(const GpuMat& src)
{
@ -620,198 +455,213 @@ int cv::gpu::countNonZero(const GpuMat& src)
    return countNonZero(src, buf);
}

int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
    using namespace ::cv::gpu::device::matrix_reductions::countnonzero;

    typedef int (*Caller)(const PtrStepSzb src, PtrStepb buf);

    static Caller multipass_callers[7] =
    typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
    static const func_t funcs[] =
    {
        countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
        countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
        countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0
        ::countNonZero::run<uchar>,
        ::countNonZero::run<schar>,
        ::countNonZero::run<ushort>,
        ::countNonZero::run<short>,
        ::countNonZero::run<int>,
        ::countNonZero::run<float>,
        ::countNonZero::run<double>
    };

    static Caller singlepass_callers[7] =
    {
        countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
        countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
        countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };

    CV_Assert(src.depth() <= CV_64F);
    CV_Assert(src.channels() == 1);

    if (src.depth() == CV_64F)
    {
        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
    ::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    Caller* callers = multipass_callers;
    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
        callers = singlepass_callers;
    const func_t func = funcs[src.depth()];

    Caller caller = callers[src.type()];
    CV_Assert(caller != 0);
    return caller(src, buf);
    return func(src, buf);
}

//////////////////////////////////////////////////////////////////////////////
// reduce

namespace cv { namespace gpu { namespace device
namespace reduce
{
    namespace matrix_reductions
    {
        template <typename T, typename S, typename D> void reduceRows_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
        template <typename T, typename S, typename D> void reduceCols_gpu(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
    template <typename T, typename S, typename D>
    void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template <typename T, typename S, typename D>
    void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    }
}}}

void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
{
    using namespace ::cv::gpu::device::matrix_reductions;

    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
    CV_Assert( src.channels() <= 4 );
    CV_Assert( dim == 0 || dim == 1 );
    CV_Assert( reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN );

    if (dtype < 0)
        dtype = src.depth();

    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKETYPE(dtype, src.channels()));
    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));

    if (dim == 0)
    {
        typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

        static const caller_t callers[6][6] =
        typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
        static const func_t funcs[7][7] =
        {
            {
                reduceRows_gpu<unsigned char, int, unsigned char>,
                0/*reduceRows_gpu<unsigned char, int, signed char>*/,
                0/*reduceRows_gpu<unsigned char, int, unsigned short>*/,
                0/*reduceRows_gpu<unsigned char, int, short>*/,
                reduceRows_gpu<unsigned char, int, int>,
                reduceRows_gpu<unsigned char, int, float>
                ::reduce::rows<unsigned char, int, unsigned char>,
                0/*::reduce::rows<unsigned char, int, signed char>*/,
                0/*::reduce::rows<unsigned char, int, unsigned short>*/,
                0/*::reduce::rows<unsigned char, int, short>*/,
                ::reduce::rows<unsigned char, int, int>,
                ::reduce::rows<unsigned char, float, float>,
                ::reduce::rows<unsigned char, double, double>
            },
            {
                0/*reduceRows_gpu<signed char, int, unsigned char>*/,
                0/*reduceRows_gpu<signed char, int, signed char>*/,
                0/*reduceRows_gpu<signed char, int, unsigned short>*/,
                0/*reduceRows_gpu<signed char, int, short>*/,
                0/*reduceRows_gpu<signed char, int, int>*/,
                0/*reduceRows_gpu<signed char, int, float>*/
                0/*::reduce::rows<signed char, int, unsigned char>*/,
                0/*::reduce::rows<signed char, int, signed char>*/,
                0/*::reduce::rows<signed char, int, unsigned short>*/,
                0/*::reduce::rows<signed char, int, short>*/,
                0/*::reduce::rows<signed char, int, int>*/,
                0/*::reduce::rows<signed char, float, float>*/,
                0/*::reduce::rows<signed char, double, double>*/
            },
            {
                0/*reduceRows_gpu<unsigned short, int, unsigned char>*/,
                0/*reduceRows_gpu<unsigned short, int, signed char>*/,
                reduceRows_gpu<unsigned short, int, unsigned short>,
                0/*reduceRows_gpu<unsigned short, int, short>*/,
                reduceRows_gpu<unsigned short, int, int>,
                reduceRows_gpu<unsigned short, int, float>
                0/*::reduce::rows<unsigned short, int, unsigned char>*/,
                0/*::reduce::rows<unsigned short, int, signed char>*/,
                ::reduce::rows<unsigned short, int, unsigned short>,
                0/*::reduce::rows<unsigned short, int, short>*/,
                ::reduce::rows<unsigned short, int, int>,
                ::reduce::rows<unsigned short, float, float>,
                ::reduce::rows<unsigned short, double, double>
            },
            {
                0/*reduceRows_gpu<short, int, unsigned char>*/,
                0/*reduceRows_gpu<short, int, signed char>*/,
                0/*reduceRows_gpu<short, int, unsigned short>*/,
                reduceRows_gpu<short, int, short>,
                reduceRows_gpu<short, int, int>,
                reduceRows_gpu<short, int, float>
                0/*::reduce::rows<short, int, unsigned char>*/,
                0/*::reduce::rows<short, int, signed char>*/,
                0/*::reduce::rows<short, int, unsigned short>*/,
                ::reduce::rows<short, int, short>,
                ::reduce::rows<short, int, int>,
                ::reduce::rows<short, float, float>,
                ::reduce::rows<short, double, double>
            },
            {
                0/*reduceRows_gpu<int, int, unsigned char>*/,
                0/*reduceRows_gpu<int, int, signed char>*/,
                0/*reduceRows_gpu<int, int, unsigned short>*/,
                0/*reduceRows_gpu<int, int, short>*/,
                reduceRows_gpu<int, int, int>,
                reduceRows_gpu<int, int, float>
                0/*::reduce::rows<int, int, unsigned char>*/,
                0/*::reduce::rows<int, int, signed char>*/,
                0/*::reduce::rows<int, int, unsigned short>*/,
                0/*::reduce::rows<int, int, short>*/,
                ::reduce::rows<int, int, int>,
                ::reduce::rows<int, float, float>,
                ::reduce::rows<int, double, double>
            },
            {
                0/*reduceRows_gpu<float, float, unsigned char>*/,
                0/*reduceRows_gpu<float, float, signed char>*/,
                0/*reduceRows_gpu<float, float, unsigned short>*/,
                0/*reduceRows_gpu<float, float, short>*/,
                0/*reduceRows_gpu<float, float, int>*/,
                reduceRows_gpu<float, float, float>
                0/*::reduce::rows<float, float, unsigned char>*/,
                0/*::reduce::rows<float, float, signed char>*/,
                0/*::reduce::rows<float, float, unsigned short>*/,
                0/*::reduce::rows<float, float, short>*/,
                0/*::reduce::rows<float, float, int>*/,
                ::reduce::rows<float, float, float>,
                ::reduce::rows<float, double, double>
            },
            {
                0/*::reduce::rows<double, double, unsigned char>*/,
                0/*::reduce::rows<double, double, signed char>*/,
                0/*::reduce::rows<double, double, unsigned short>*/,
                0/*::reduce::rows<double, double, short>*/,
                0/*::reduce::rows<double, double, int>*/,
                0/*::reduce::rows<double, double, float>*/,
                ::reduce::rows<double, double, double>
            }
        };

        const caller_t func = callers[src.depth()][dst.depth()];
        const func_t func = funcs[src.depth()][dst.depth()];

        if (!func)
            CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");

        func(src.reshape(1), dst.reshape(1), reduceOp, StreamAccessor::getStream(stream));
        func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
    }
    else
    {
        typedef void (*caller_t)(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);

        static const caller_t callers[6][6] =
        typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
        static const func_t funcs[7][7] =
        {
            {
                reduceCols_gpu<unsigned char, int, unsigned char>,
                0/*reduceCols_gpu<unsigned char, int, signed char>*/,
                0/*reduceCols_gpu<unsigned char, int, unsigned short>*/,
                0/*reduceCols_gpu<unsigned char, int, short>*/,
                reduceCols_gpu<unsigned char, int, int>,
                reduceCols_gpu<unsigned char, int, float>
                ::reduce::cols<unsigned char, int, unsigned char>,
                0/*::reduce::cols<unsigned char, int, signed char>*/,
                0/*::reduce::cols<unsigned char, int, unsigned short>*/,
                0/*::reduce::cols<unsigned char, int, short>*/,
                ::reduce::cols<unsigned char, int, int>,
                ::reduce::cols<unsigned char, float, float>,
                ::reduce::cols<unsigned char, double, double>
            },
            {
                0/*reduceCols_gpu<signed char, int, unsigned char>*/,
                0/*reduceCols_gpu<signed char, int, signed char>*/,
                0/*reduceCols_gpu<signed char, int, unsigned short>*/,
                0/*reduceCols_gpu<signed char, int, short>*/,
                0/*reduceCols_gpu<signed char, int, int>*/,
                0/*reduceCols_gpu<signed char, int, float>*/
                0/*::reduce::cols<signed char, int, unsigned char>*/,
                0/*::reduce::cols<signed char, int, signed char>*/,
                0/*::reduce::cols<signed char, int, unsigned short>*/,
                0/*::reduce::cols<signed char, int, short>*/,
                0/*::reduce::cols<signed char, int, int>*/,
                0/*::reduce::cols<signed char, float, float>*/,
                0/*::reduce::cols<signed char, double, double>*/
            },
            {
                0/*reduceCols_gpu<unsigned short, int, unsigned char>*/,
                0/*reduceCols_gpu<unsigned short, int, signed char>*/,
                reduceCols_gpu<unsigned short, int, unsigned short>,
                0/*reduceCols_gpu<unsigned short, int, short>*/,
                reduceCols_gpu<unsigned short, int, int>,
                reduceCols_gpu<unsigned short, int, float>
                0/*::reduce::cols<unsigned short, int, unsigned char>*/,
                0/*::reduce::cols<unsigned short, int, signed char>*/,
                ::reduce::cols<unsigned short, int, unsigned short>,
                0/*::reduce::cols<unsigned short, int, short>*/,
                ::reduce::cols<unsigned short, int, int>,
                ::reduce::cols<unsigned short, float, float>,
                ::reduce::cols<unsigned short, double, double>
            },
            {
                0/*reduceCols_gpu<short, int, unsigned char>*/,
                0/*reduceCols_gpu<short, int, signed char>*/,
                0/*reduceCols_gpu<short, int, unsigned short>*/,
                reduceCols_gpu<short, int, short>,
                reduceCols_gpu<short, int, int>,
                reduceCols_gpu<short, int, float>
                0/*::reduce::cols<short, int, unsigned char>*/,
                0/*::reduce::cols<short, int, signed char>*/,
                0/*::reduce::cols<short, int, unsigned short>*/,
                ::reduce::cols<short, int, short>,
                ::reduce::cols<short, int, int>,
                ::reduce::cols<short, float, float>,
                ::reduce::cols<short, double, double>
            },
            {
                0/*reduceCols_gpu<int, int, unsigned char>*/,
                0/*reduceCols_gpu<int, int, signed char>*/,
                0/*reduceCols_gpu<int, int, unsigned short>*/,
                0/*reduceCols_gpu<int, int, short>*/,
                reduceCols_gpu<int, int, int>,
                reduceCols_gpu<int, int, float>
                0/*::reduce::cols<int, int, unsigned char>*/,
                0/*::reduce::cols<int, int, signed char>*/,
                0/*::reduce::cols<int, int, unsigned short>*/,
                0/*::reduce::cols<int, int, short>*/,
                ::reduce::cols<int, int, int>,
                ::reduce::cols<int, float, float>,
                ::reduce::cols<int, double, double>
            },
            {
                0/*reduceCols_gpu<float, unsigned char>*/,
                0/*reduceCols_gpu<float, signed char>*/,
                0/*reduceCols_gpu<float, unsigned short>*/,
                0/*reduceCols_gpu<float, short>*/,
                0/*reduceCols_gpu<float, int>*/,
                reduceCols_gpu<float, float, float>
                0/*::reduce::cols<float, float, unsigned char>*/,
                0/*::reduce::cols<float, float, signed char>*/,
                0/*::reduce::cols<float, float, unsigned short>*/,
                0/*::reduce::cols<float, float, short>*/,
                0/*::reduce::cols<float, float, int>*/,
                ::reduce::cols<float, float, float>,
                ::reduce::cols<float, double, double>
            },
            {
                0/*::reduce::cols<double, double, unsigned char>*/,
                0/*::reduce::cols<double, double, signed char>*/,
                0/*::reduce::cols<double, double, unsigned short>*/,
                0/*::reduce::cols<double, double, short>*/,
                0/*::reduce::cols<double, double, int>*/,
                0/*::reduce::cols<double, double, float>*/,
                ::reduce::cols<double, double, double>
            }
        };

        const caller_t func = callers[src.depth()][dst.depth()];
        const func_t func = funcs[src.depth()][dst.depth()];

        if (!func)
            CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");

        func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
        func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
    }
}

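A usage sketch of the reworked reduce; dim == 0 collapses the matrix to a single row, and the widened accumulators in the tables above are what allow float and double outputs for integer inputs (sizes are hypothetical):

    cv::gpu::GpuMat src(480, 640, CV_8UC3);
    cv::gpu::GpuMat dst;
    cv::gpu::reduce(src, dst, 0, CV_REDUCE_SUM, CV_32S);  // dst: 1 x 640, CV_32SC3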
@ -65,6 +65,8 @@
#include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
#include "opencv2/gpu/device/warp.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"


//==============================================================================
@ -81,6 +83,20 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
//assuming size <= WARP_SIZE and size is power of 2
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
#if __CUDA_ARCH__ >= 300
    const unsigned int laneId = cv::gpu::device::Warp::laneId();

    // scan on shuffle functions
    #pragma unroll
    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
    {
        const Ncv32u n = cv::gpu::device::shfl_up(idata, i);
        if (laneId >= i)
            idata += n;
    }

    return idata;
#else
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;
    pos += K_WARP_SIZE;
@ -93,6 +109,7 @@ __device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
    s_Data[pos] += s_Data[pos - 16];

    return s_Data[pos];
#endif
}

__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
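The shuffle branch is a Kogge-Stone inclusive scan: five rounds with offsets 1, 2, 4, 8, 16 leave each lane holding the sum of lanes 0 through itself. A host-side sketch of the same recurrence (sequential emulation; iterating lanes downward makes each round read pre-round values):

    int data[32];
    for (int k = 0; k < 32; ++k) data[k] = 1;
    for (int i = 1; i <= 16; i *= 2)
        for (int k = 31; k >= i; --k)
            data[k] += data[k - i];
    // data[0] == 1 and data[31] == 32: inclusive prefix sums of an all-ones input.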
@ -44,6 +44,8 @@
#include <vector>
#include <cuda_runtime.h>
#include "NPP_staging.hpp"
#include "opencv2/gpu/device/warp.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"


texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
@ -90,6 +92,36 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
//assuming size <= WARP_SIZE and size is power of 2
template <class T>
inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
{
#if __CUDA_ARCH__ >= 300
    const unsigned int laneId = cv::gpu::device::Warp::laneId();

    // scan on shuffle functions
    #pragma unroll
    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
    {
        const T n = cv::gpu::device::shfl_up(idata, i);
        if (laneId >= i)
            idata += n;
    }

    return idata;
#else
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;
    pos += K_WARP_SIZE;
    s_Data[pos] = idata;

    s_Data[pos] += s_Data[pos - 1];
    s_Data[pos] += s_Data[pos - 2];
    s_Data[pos] += s_Data[pos - 4];
    s_Data[pos] += s_Data[pos - 8];
    s_Data[pos] += s_Data[pos - 16];

    return s_Data[pos];
#endif
}
inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data)
{
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;

@ -172,7 +172,7 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
    const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
    GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
    if (!isDeviceArch11_)
    if (deviceSupports(FEATURE_SET_COMPUTE_12))
        device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
    else
        device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
@ -191,7 +191,7 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
    const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
    GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
    if (!isDeviceArch11_)
    if (deviceSupports(FEATURE_SET_COMPUTE_12))
        device::optflow_farneback::gaussianBlur5Gpu(
            M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
    else
@ -209,7 +209,7 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
void cv::gpu::FarnebackOpticalFlow::operator ()(
    const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
{
    CV_Assert(frame0.type() == CV_8U && frame1.type() == CV_8U);
    CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
    CV_Assert(frame0.size() == frame1.size());
    CV_Assert(polyN == 5 || polyN == 7);
    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);

@ -53,35 +53,32 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, Gpu

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu { namespace device
{
    namespace pyrlk
    {
        void loadConstants(int2 winSize, int iters);

        void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
        void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
            int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
        void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
        void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
            int level, dim3 block, dim3 patch, cudaStream_t stream = 0);

        void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
        void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
            PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
    }
}}}

namespace
{
    void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch, bool isDeviceArch11)
    void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
    {
        if (winSize.width > 32 && winSize.width > 2 * winSize.height)
        {
            block.x = isDeviceArch11 ? 16 : 32;
            block.x = deviceSupports(FEATURE_SET_COMPUTE_12) ? 32 : 16;
            block.y = 8;
        }
        else
        {
            block.x = 16;
            block.y = isDeviceArch11 ? 8 : 16;
            block.y = deviceSupports(FEATURE_SET_COMPUTE_12) ? 16 : 8;
        }

        patch.x = (winSize.width + block.x - 1) / block.x;
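A quick arithmetic check of what this picks, assuming patch.y mirrors patch.x (the hunk cuts off before it): with a 21x21 window on a compute 1.2+ device the narrow-window branch gives block = 16x16, so:

    // patch.x = (21 + 16 - 1) / 16 = 2  -> each thread covers a 2x2 cell of the window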
@ -93,8 +90,6 @@ namespace

void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
-   using namespace cv::gpu::device::pyrlk;
-
    if (prevPts.empty())
    {
        nextPts.release();
@ -104,9 +99,9 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
    }

    dim3 block, patch;
-   calcPatchSize(winSize, block, patch, isDeviceArch11_);
+   calcPatchSize(winSize, block, patch);

-   CV_Assert(prevImg.type() == CV_8UC1 || prevImg.type() == CV_8UC3 || prevImg.type() == CV_8UC4);
+   CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
    CV_Assert(maxLevel >= 0);
    CV_Assert(winSize.width > 2 && winSize.height > 2);
@ -155,19 +150,19 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
        pyrDown(nextPyr_[level - 1], nextPyr_[level]);
    }

-   loadConstants(make_int2(winSize.width, winSize.height), iters);
+   pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);

    for (int level = maxLevel; level >= 0; level--)
    {
        if (cn == 1)
        {
-           lkSparse1_gpu(prevPyr_[level], nextPyr_[level],
+           pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                level, block, patch);
        }
        else
        {
-           lkSparse4_gpu(prevPyr_[level], nextPyr_[level],
+           pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                level, block, patch);
        }
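A hedged usage sketch of the sparse interface touched above, as it reads after this change (the cv::gpu API of this era; parameter values illustrative):

#include "opencv2/gpu/gpu.hpp"

void trackPoints(const cv::gpu::GpuMat& prev8u, const cv::gpu::GpuMat& next8u,
                 const cv::gpu::GpuMat& prevPts /* 1xN, CV_32FC2 */)
{
    cv::gpu::PyrLKOpticalFlow lk;
    lk.winSize  = cv::Size(21, 21);
    lk.maxLevel = 3;
    lk.iters    = 30;

    cv::gpu::GpuMat nextPts, status;
    lk.sparse(prev8u, next8u, prevPts, nextPts, status); // err output is optional
}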
@ -176,8 +171,6 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next

void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
{
-   using namespace cv::gpu::device::pyrlk;
-
    CV_Assert(prevImg.type() == CV_8UC1);
    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
    CV_Assert(maxLevel >= 0);
@ -211,7 +204,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
    vPyr_[1].setTo(Scalar::all(0));

    int2 winSize2i = make_int2(winSize.width, winSize.height);
-   loadConstants(winSize2i, iters);
+   pyrlk::loadConstants(winSize2i, iters);

    PtrStepSzf derr = err ? *err : PtrStepSzf();

@ -221,7 +214,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
    {
        int idx2 = (idx + 1) & 1;

-       lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
+       pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
            level == 0 ? derr : PtrStepSzf(), winSize2i);

        if (level > 0)
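For context: the dense path ping-pongs between two flow buffers across pyramid levels, and (idx + 1) & 1 flips between indices 0 and 1. A standalone sketch of the idiom:

#include <cstdio>

int main()
{
    int idx = 0;
    for (int level = 3; level >= 0; --level)
    {
        int idx2 = (idx + 1) & 1;  // the other buffer of the pair
        std::printf("level %d: buffers %d and %d swap roles\n", level, idx, idx2);
        idx = idx2;                // alternate for the next level
    }
    return 0;
}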
@ -54,7 +54,7 @@ namespace cv { namespace gpu { namespace device
{
    template <typename T>
    void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
-       int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+       int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
}
}}}

@ -63,7 +63,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
    using namespace cv::gpu::device::imgproc;

    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
-       int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+       int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

    static const func_t funcs[6][4] =
    {
@ -91,15 +91,12 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
    Scalar_<float> borderValueFloat;
    borderValueFloat = borderValue;

-   DeviceInfo info;
-   int cc = info.majorVersion() * 10 + info.minorVersion();
-
    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);

    func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap,
-       dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), cc);
+       dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
}

#endif // HAVE_CUDA
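For context on the signature change: the numeric compute capability (cc = major * 10 + minor) was only ever compared against 2.0 inside these kernels, so it collapses into a single bool. A sketch of the before/after (function names illustrative):

#include "opencv2/gpu/gpu.hpp"

// Before: callers computed a numeric capability and kernels compared it:
//   int cc = info.majorVersion() * 10 + info.minorVersion();
//   if (cc >= 20) { /* Fermi-or-newer path */ }
//
// After: callers pass the one predicate that matters.
void launch(bool cc20)
{
    if (cc20) { /* Fermi-or-newer path */ }
}

void caller()
{
    launch(cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20));
}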
@ -78,7 +78,7 @@ namespace

    if (depth == CV_64F)
    {
-       if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+       if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

@ -122,7 +122,7 @@ namespace

    if (depth == CV_64F)
    {
-       if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+       if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
    }

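For context: deviceSupports() folds the old two-sided test (was the module built with the feature, and does the current device expose it) into one call, which is why both conditions collapse here. An equivalent standalone guard, as a sketch:

#include "opencv2/gpu/gpu.hpp"

void checkDoubleSupport(int depth)
{
    using namespace cv::gpu;

    // Equivalent to the removed pair of checks:
    //   TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)
    if (depth == CV_64F && !deviceSupports(NATIVE_DOUBLE))
        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}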
@ -86,8 +86,7 @@ namespace cv { namespace gpu { namespace device

    void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);

-   void compute_descriptors_gpu(const PtrStepSzf& descriptors,
-       const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
+   void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}
}}}

@ -122,9 +121,6 @@ namespace
    CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
    CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);

-   if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
-       CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
-
    const int min_size = calcSize(surf_.nOctaves - 1, 0);
    CV_Assert(img_rows - min_size >= 0);
    CV_Assert(img_cols - min_size >= 0);
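For context: taking the descriptor surface as PtrStepSz<float4> lets the kernel write 16 bytes per store instead of one float at a time. An illustrative CUDA sketch of that access pattern (not the commit's kernel; assumes the 2.4-era device pointer types):

#include "opencv2/core/cuda_devptrs.hpp"

__global__ void fill4(cv::gpu::PtrStepSz<float4> dst, float v)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = make_float4(v, v, v, v); // one vectorized 16-byte store
}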
@ -61,13 +61,13 @@ namespace cv { namespace gpu { namespace device

    template <typename T>
    void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-       int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+       int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

    void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);

    template <typename T>
    void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
-       int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+       int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
}
}}}

@ -143,33 +143,31 @@ namespace
    {
        typedef typename NppWarpFunc<DEPTH>::npp_t npp_t;

-       static void call(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst,
-           double coeffs[][3], cv::Size dsize, int interpolation, cudaStream_t stream)
+       static void call(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
        {
            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};

-           dst.create(dsize, src.type());
-           dst.setTo(cv::Scalar::all(0));
-
            NppiSize srcsz;
-           srcsz.height = wholeSize.height;
-           srcsz.width = wholeSize.width;
+           srcsz.height = src.rows;
+           srcsz.width = src.cols;

            NppiRect srcroi;
-           srcroi.x = ofs.x;
-           srcroi.y = ofs.y;
+           srcroi.x = 0;
+           srcroi.y = 0;
            srcroi.height = src.rows;
            srcroi.width = src.cols;

            NppiRect dstroi;
-           dstroi.x = dstroi.y = 0;
+           dstroi.x = 0;
+           dstroi.y = 0;
            dstroi.height = dst.rows;
            dstroi.width = dst.cols;

            cv::gpu::NppStreamHandler h(stream);

-           nppSafeCall( func((npp_t*)src.datastart, srcsz, static_cast<int>(src.step), srcroi,
-               dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
+           nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
+               dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi,
+               coeffs, npp_inter[interpolation]) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
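For context: the NPP caller no longer rebuilds the parent image from datastart plus an ROI offset. Instead, the useNpp test below requires ofs == (0, 0), so NPP can be handed the matrix itself with the source ROI anchored at the origin. A sketch of the simplified setup (illustrative; assumes src is a full, non-ROI GpuMat as the new test enforces):

#include <nppi.h>
#include "opencv2/gpu/gpu.hpp"

void describeRois(const cv::gpu::GpuMat& src, const cv::gpu::GpuMat& dst)
{
    NppiSize srcsz  = { src.cols, src.rows };         // whole source == the matrix itself
    NppiRect srcroi = { 0, 0, src.cols, src.rows };   // ROI starts at the origin
    NppiRect dstroi = { 0, 0, dst.cols, dst.rows };
    (void)srcsz; (void)srcroi; (void)dstroi;          // would be handed to an nppiWarp* call
}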
@ -187,6 +185,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);

+   dst.create(dsize, src.type());
+
    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);
@ -231,8 +231,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        }
    };

-   bool useNpp = borderMode == BORDER_CONSTANT;
-   useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
+   bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
#ifdef linux
    // NPP bug on float data
    useNpp = useNpp && src.depth() != CV_32F;
@ -240,7 +239,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz

    if (useNpp)
    {
-       typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);
+       typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);

        static const func_t funcs[2][6][4] =
        {
@ -262,6 +261,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        }
    };

+   dst.setTo(borderValue);
+
    double coeffs[2][3];
    Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
    M.convertTo(coeffsMat, coeffsMat.type());
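For context: NPP is now used only when the source is not an ROI view (ofs == 0, 0), since the wholeSize/ofs plumbing was dropped from the NPP caller above. A sketch of the gating predicate (the table entry is a stand-in for the useNppTab lookup):

#include "opencv2/gpu/gpu.hpp"

// NPP handles the common constant-border, non-ROI cases; everything else
// falls through to the custom CUDA kernels.
bool canUseNpp(const cv::gpu::GpuMat& src, int borderMode, cv::Point ofs, bool tabEntry)
{
    bool useNpp = borderMode == cv::BORDER_CONSTANT
               && ofs.x == 0 && ofs.y == 0    // src must not be an ROI of a larger image
               && tabEntry;                   // depth/channels/interpolation supported
#ifdef linux
    useNpp = useNpp && src.depth() != CV_32F; // NPP bug on float data
#endif
    return useNpp;
}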
@ -269,14 +270,14 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

-       func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
+       func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
    }
    else
    {
        using namespace cv::gpu::device::imgproc;

        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-           int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+           int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

        static const func_t funcs[6][4] =
        {
@ -294,8 +295,6 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        int gpuBorderType;
        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));

-       dst.create(dsize, src.type());
-
        float coeffs[2 * 3];
        Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);

@ -311,11 +310,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        Scalar_<float> borderValueFloat;
        borderValueFloat = borderValue;

-       DeviceInfo info;
-       int cc = info.majorVersion() * 10 + info.minorVersion();
-
        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
-           dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
+           dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
    }
}
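For context: unlike the NPP branch, the custom-kernel branch still supports ROI sources. It wraps the parent allocation as a PtrStepSzb using locateROI's wholeSize/ofs, so the kernel can sample border pixels that lie outside the ROI. A standalone sketch:

#include "opencv2/gpu/gpu.hpp"

void wrapWhole(const cv::gpu::GpuMat& src)
{
    cv::Size wholeSize;
    cv::Point ofs;
    src.locateROI(wholeSize, ofs); // parent extent, and this view's offset within it

    // View over the parent allocation; the kernel indexes it at (x + ofs.x, y + ofs.y).
    cv::gpu::PtrStepSzb whole(wholeSize.height, wholeSize.width, src.datastart, src.step);
    (void)whole;
}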
@ -329,6 +325,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);

+   dst.create(dsize, src.type());
+
    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);
@ -373,8 +371,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        }
    };

-   bool useNpp = borderMode == BORDER_CONSTANT;
-   useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
+   bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
#ifdef linux
    // NPP bug on float data
    useNpp = useNpp && src.depth() != CV_32F;
@ -382,7 +379,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size

    if (useNpp)
    {
-       typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);
+       typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);

        static const func_t funcs[2][6][4] =
        {
@ -404,6 +401,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        }
    };

+   dst.setTo(borderValue);
+
    double coeffs[3][3];
    Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
    M.convertTo(coeffsMat, coeffsMat.type());
@ -411,14 +410,14 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

-       func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
+       func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
    }
    else
    {
        using namespace cv::gpu::device::imgproc;

        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
-           int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+           int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

        static const func_t funcs[6][4] =
        {
@ -436,8 +435,6 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        int gpuBorderType;
        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));

-       dst.create(dsize, src.type());
-
        float coeffs[3 * 3];
        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);

@ -453,11 +450,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        Scalar_<float> borderValueFloat;
        borderValueFloat = borderValue;

-       DeviceInfo info;
-       int cc = info.majorVersion() * 10 + info.minorVersion();
-
        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
-           dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
+           dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
    }
}
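A hedged end-to-end usage sketch of the two public entry points touched above (the cv::gpu API of this era; matrix values illustrative):

#include "opencv2/gpu/gpu.hpp"
#include "opencv2/imgproc/imgproc.hpp"

void warpDemo(const cv::gpu::GpuMat& src)
{
    cv::Mat A = (cv::Mat_<double>(2, 3) << 1, 0, 10,   // 2x3 affine: translate by (10, 5)
                                           0, 1, 5);
    cv::Mat H = cv::Mat::eye(3, 3, CV_64F);            // 3x3 homography (identity)

    cv::gpu::GpuMat dstA, dstP;
    cv::gpu::warpAffine(src, dstA, A, src.size(), cv::INTER_LINEAR);
    cv::gpu::warpPerspective(src, dstP, H, src.size(), cv::INTER_LINEAR);
}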