diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 3c7dcc7d71..642deeeaad 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -756,12 +756,6 @@ namespace cv //! computes the proximity map for the raster template and the image where the template is searched for CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method); - //! downsamples image - CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); - - //! upsamples image - CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null()); - //! smoothes the source image and downsamples it CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null()); diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp index 76177a060a..d40f070016 100644 --- a/modules/gpu/perf/perf_filters.cpp +++ b/modules/gpu/perf/perf_filters.cpp @@ -3,7 +3,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing::ValuesIn(devices()), testing::Values(GPU_TYPICAL_MAT_SIZES), testing::Values(CV_8UC1, CV_8UC4), - testing::Values(3, 5, 7))) + testing::Values(3, 5))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); Size size = std::tr1::get<1>(GetParam()); @@ -37,7 +37,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing:: testing::Values(GPU_TYPICAL_MAT_SIZES), testing::Values(CV_8UC1, CV_8UC4), testing::Values((int)MORPH_ERODE, (int)MORPH_DILATE), - testing::Values(3, 5, 7))) + testing::Values(3, 5))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); Size size = std::tr1::get<1>(GetParam()); @@ -71,7 +71,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing:: PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(testing::ValuesIn(devices()), testing::Values(GPU_TYPICAL_MAT_SIZES), testing::Values(CV_8UC1, CV_8UC4), - testing::Values(3, 5, 7))) + testing::Values(3, 5))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); Size size = std::tr1::get<1>(GetParam()); @@ -103,8 +103,8 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()), testing::Values(GPU_TYPICAL_MAT_SIZES), - testing::Values(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC3, CV_32FC1), - testing::Values(3, 5, 7), + testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1), + testing::Values(3, 5), testing::Values((int)BORDER_REFLECT101, (int)BORDER_CONSTANT))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index f4d63d6dd6..fb72043da7 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -244,8 +244,8 @@ PERF_TEST_P(DevInfo_Size_MatType, threshold, testing::Combine(testing::ValuesIn( } PERF_TEST_P(DevInfo_Size_MatType_Interpolation_SizeCoeff, resize, testing::Combine(testing::ValuesIn(devices()), - testing::Values(GPU_TYPICAL_MAT_SIZES), - testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4), + testing::Values(szSXGA, sz1080p), + testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_32FC1), testing::Values((int)INTER_NEAREST, (int)INTER_LINEAR, (int)INTER_CUBIC), testing::Values(0.5, 2.0))) { diff --git 
a/modules/gpu/perf/perf_utility.hpp b/modules/gpu/perf/perf_utility.hpp index 883bd4a392..ec6b052fe7 100644 --- a/modules/gpu/perf/perf_utility.hpp +++ b/modules/gpu/perf/perf_utility.hpp @@ -53,7 +53,8 @@ typedef TestBaseWithParam< std::tr1::tuple > DevInfo_K_Des const cv::Size sz1800x1500 = cv::Size(1800, 1500); const cv::Size sz4700x3000 = cv::Size(4700, 3000); -#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000 +//#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000 +#define GPU_TYPICAL_MAT_SIZES szSXGA, sz1080p, sz4700x3000 //! read image from testdata folder. Mat readImage(const string& fileName, int flags = CV_LOAD_IMAGE_COLOR); diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index ab24cb0f7e..c56dcda0df 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -179,18 +179,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, static const match_caller_t match_callers[3][8] = { { - matchSingleL1_gpu, matchSingleL1_gpu, + matchSingleL1_gpu, 0/*matchSingleL1_gpu*/, matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, 0, 0 }, { - matchSingleL2_gpu, matchSingleL2_gpu, - matchSingleL2_gpu, matchSingleL2_gpu, - matchSingleL2_gpu, matchSingleL2_gpu, 0, 0 + 0/*matchSingleL2_gpu*/, 0/*matchSingleL2_gpu*/, + 0/*matchSingleL2_gpu*/, 0/*matchSingleL2_gpu*/, + 0/*matchSingleL2_gpu*/, matchSingleL2_gpu, 0, 0 }, { - matchSingleHamming_gpu, matchSingleHamming_gpu, - matchSingleHamming_gpu, matchSingleHamming_gpu, + matchSingleHamming_gpu, 0/*matchSingleHamming_gpu*/, + matchSingleHamming_gpu, 0/*matchSingleHamming_gpu*/, matchSingleHamming_gpu, 0, 0, 0 } }; @@ -318,18 +318,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes static const match_caller_t match_callers[3][8] = { { - matchCollectionL1_gpu, matchCollectionL1_gpu, + matchCollectionL1_gpu, 0/*matchCollectionL1_gpu*/, matchCollectionL1_gpu, matchCollectionL1_gpu, matchCollectionL1_gpu, matchCollectionL1_gpu, 0, 0 }, { - matchCollectionL2_gpu, matchCollectionL2_gpu, - matchCollectionL2_gpu, matchCollectionL2_gpu, - matchCollectionL2_gpu, matchCollectionL2_gpu, 0, 0 + 0/*matchCollectionL2_gpu*/, 0/*matchCollectionL2_gpu*/, + 0/*matchCollectionL2_gpu*/, 0/*matchCollectionL2_gpu*/, + 0/*matchCollectionL2_gpu*/, matchCollectionL2_gpu, 0, 0 }, { - matchCollectionHamming_gpu, matchCollectionHamming_gpu, - matchCollectionHamming_gpu, matchCollectionHamming_gpu, + matchCollectionHamming_gpu, 0/*matchCollectionHamming_gpu*/, + matchCollectionHamming_gpu, 0/*matchCollectionHamming_gpu*/, matchCollectionHamming_gpu, 0, 0, 0 } }; @@ -427,16 +427,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con static const match_caller_t match_callers[3][8] = { { - knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, + knnMatchL1_gpu, 0/*knnMatchL1_gpu*/, knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, 0, 0 }, { - knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, - knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, 0, 0 + 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, + 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, knnMatchL2_gpu, 0, 0 }, { - knnMatchHamming_gpu, knnMatchHamming_gpu, knnMatchHamming_gpu, - knnMatchHamming_gpu, knnMatchHamming_gpu, 0, 0, 0 + knnMatchHamming_gpu, 0/*knnMatchHamming_gpu*/, knnMatchHamming_gpu, + 0/*knnMatchHamming_gpu*/, 
knnMatchHamming_gpu, 0, 0, 0 } }; @@ -605,16 +605,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, static const radiusMatch_caller_t radiusMatch_callers[3][8] = { { - radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, + radiusMatchL1_gpu, 0/*radiusMatchL1_gpu*/, radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, 0, 0 }, { - radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, - radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, 0, 0 + 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, + 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, radiusMatchL2_gpu, 0, 0 }, { - radiusMatchHamming_gpu, radiusMatchHamming_gpu, radiusMatchHamming_gpu, - radiusMatchHamming_gpu, radiusMatchHamming_gpu, 0, 0, 0 + radiusMatchHamming_gpu, 0/*radiusMatchHamming_gpu*/, radiusMatchHamming_gpu, + 0/*radiusMatchHamming_gpu*/, radiusMatchHamming_gpu, 0, 0, 0 } }; diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu new file mode 100644 index 0000000000..079251e485 --- /dev/null +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -0,0 +1,464 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage.
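
The knnMatch2 kernel in the file that follows keeps the two smallest distances per query (k = 2). As a point of reference, here is a minimal host-side sketch of a best/second-best scan (plain C++, a hypothetical helper, not part of the patch); note that it demotes the previous best into second place on improvement, whereas the per-thread rule in the kernel is simpler and is combined with a later cross-thread reduction:

#include <limits>
#include <utility>

// Return indices of the smallest and second-smallest entries of dist[0..n).
static std::pair<int, int> bestTwo(const float* dist, int n)
{
    float d1 = std::numeric_limits<float>::max(), d2 = d1;
    int i1 = -1, i2 = -1;
    for (int i = 0; i < n; ++i)
    {
        const float d = dist[i];
        if (d < d1)      { d2 = d1; i2 = i1; d1 = d; i1 = i; } // demote old best
        else if (d < d2) { d2 = d;  i2 = i; }                  // new runner-up
    }
    return std::make_pair(i1, i2);
}
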
+// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/vec_distance.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace bfmatcher +{ + template + __device__ void distanceCalcLoop(const PtrStep_& query, const DevMem2D_& train, const Mask& m, int queryIdx, + typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, + typename Dist::result_type* smem) + { + const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); + + typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y; + + distMin1 = numeric_limits::max(); + distMin2 = numeric_limits::max(); + + bestTrainIdx1 = -1; + bestTrainIdx2 = -1; + + for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) + { + if (m(queryIdx, trainIdx)) + { + Dist dist; + + const T* trainRow = train.ptr(trainIdx); + + vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); + + const typename Dist::result_type val = dist; + + if (val < distMin1) + { + distMin1 = val; + bestTrainIdx1 = trainIdx; + } + else if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = trainIdx; + } + } + } + } + + template + __global__ void knnMatch2(const PtrStep_ query, const DevMem2D_ train, const Mask m, int2* trainIdx, float2* distance) + { + typedef typename Dist::result_type result_type; + typedef typename Dist::value_type value_type; + + __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + + const int queryIdx = blockIdx.x; + + result_type distMin1; + result_type distMin2; + + int bestTrainIdx1; + int bestTrainIdx2; + + distanceCalcLoop(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); + __syncthreads(); + + volatile result_type* sdistMinRow = smem; + volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y); + + if (threadIdx.x == 0) + { + sdistMinRow[threadIdx.y] = distMin1; + sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2; + + sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1; + sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2; + } + __syncthreads(); + + if (threadIdx.x == 0 && threadIdx.y == 0) + { + distMin1 = numeric_limits::max(); + distMin2 = numeric_limits::max(); + + bestTrainIdx1 = -1; + bestTrainIdx2 = -1; + + #pragma unroll + for (int i = 0; i < BLOCK_DIM_Y; ++i) + { + result_type val = sdistMinRow[i]; + + if (val < distMin1) + { + distMin1 = val; + bestTrainIdx1 = sbestTrainIdxRow[i]; + } + else if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = sbestTrainIdxRow[i]; + } + } + + #pragma unroll + for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i) + { + result_type val = sdistMinRow[i]; + + if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = sbestTrainIdxRow[i]; + } + } + + trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2); + distance[queryIdx] = make_float2(distMin1, distMin2); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Knn 2 Match kernel caller + + template + void knnMatch2Simple_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + knnMatch2, Dist, T> + <<>>(query, train, mask, trainIdx, distance); + cudaSafeCall( 
cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void knnMatch2Cached_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + StaticAssert= MAX_LEN>::check(); // block size must be greater than descriptor length + StaticAssert::check(); // max descriptor length must be divisible by blockDimX + + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + knnMatch2, Dist, T> + <<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Knn 2 Match Dispatcher + + template + void knnMatch2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (query.cols < 64) + { + knnMatch2Cached_caller<16, 16, 64, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols == 64) + { + knnMatch2Cached_caller<16, 16, 64, true, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols < 128) + { + knnMatch2Cached_caller<16, 16, 128, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols == 128 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 128, true, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols < 256 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 256, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols == 256 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 256, true, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else + { + knnMatch2Simple_caller<16, 16, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Calc distance kernel + + template + __global__ void calcDistance(const PtrStep_ query, const DevMem2D_ train, const Mask mask, PtrStepf distance) + { + __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; + + typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; + + const int queryIdx = blockIdx.x; + const T* queryDescs = query.ptr(queryIdx); + + const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; + + if (trainIdx < train.rows) + { + const T* trainDescs = train.ptr(trainIdx); + + typename Dist::result_type myDist = numeric_limits::max(); + + if (mask(queryIdx, trainIdx)) + { + Dist dist; + + calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); + + myDist = dist; + } + + if (threadIdx.x == 0) + distance.ptr(queryIdx)[trainIdx] = myDist; + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Calc distance kernel caller + + template + void calcDistance_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const
DevMem2Df& distance, cudaStream_t stream) + { + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); + + calcDistance<<>>(query, train, mask, distance); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream) + { + calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast(allDist), stream); + } + + /////////////////////////////////////////////////////////////////////////////// + // find knn match kernel + + template __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_) + { + const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; + __shared__ float sdist[SMEM_SIZE]; + __shared__ int strainIdx[SMEM_SIZE]; + + const int queryIdx = blockIdx.x; + + float* allDist = allDist_.ptr(queryIdx); + int* trainIdx = trainIdx_.ptr(queryIdx); + float* distance = distance_.ptr(queryIdx); + + float dist = numeric_limits::max(); + int bestIdx = -1; + + for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE) + { + float reg = allDist[i]; + if (reg < dist) + { + dist = reg; + bestIdx = i; + } + } + + sdist[threadIdx.x] = dist; + strainIdx[threadIdx.x] = bestIdx; + __syncthreads(); + + reducePredVal(sdist, dist, strainIdx, bestIdx, threadIdx.x, less()); + + if (threadIdx.x == 0) + { + if (dist < numeric_limits::max()) + { + allDist[bestIdx] = numeric_limits::max(); + trainIdx[i] = bestIdx; + distance[i] = dist; + } + } + } + + /////////////////////////////////////////////////////////////////////////////// + // find knn match kernel caller + + template void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 threads(BLOCK_SIZE, 1, 1); + const dim3 grid(trainIdx.rows, 1, 1); + + for (int i = 0; i < k; ++i) + { + findBestMatch<<>>(allDist, i, trainIdx, distance); + cudaSafeCall( cudaGetLastError() ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream) + { + findKnnMatch_caller<256>(k, static_cast(trainIdx), static_cast(distance), static_cast(allDist), stream); + } + + /////////////////////////////////////////////////////////////////////////////// + // knn match Dispatcher + + template + void knnMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream) + { + if (mask.data) + { + if (k == 2) + { + knnMatch2Dispatcher(query, train, SingleMask(mask), trainIdx, distance, cc, stream); + return; + } + + calcDistanceDispatcher(query, train, SingleMask(mask), allDist, stream); + } + else + { + if (k == 2) + { + knnMatch2Dispatcher(query, train, WithOutMask(), trainIdx, distance, cc, stream); + return; + } + + calcDistanceDispatcher(query, train, WithOutMask(), allDist, stream); + } + + findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); + } + + /////////////////////////////////////////////////////////////////////////////// + // knn match caller + + template void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, 
const DevMem2D& allDist, + int cc, cudaStream_t stream) + { + knnMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + } + + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + + template void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream) + { + knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + } + + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + + template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream) + { + knnMatchDispatcher(static_cast< DevMem2D_ >(query), 
static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + } + + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); +}}} diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu new file mode 100644 index 0000000000..83f48b4d3c --- /dev/null +++ b/modules/gpu/src/cuda/bf_match.cu @@ -0,0 +1,403 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed.
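
The matchDispatcher further down in this file (like its knnMatch2 and radiusMatch counterparts) selects a kernel instantiation from the query descriptor width and the device compute capability, falling back to an uncached kernel when no cached variant fits. A condensed sketch of that selection logic, as a hypothetical stand-alone helper (cc is 10 * major + minor, so 12 means sm_12):

enum KernelKind { CachedSmall, CachedLarge, Simple };

// Descriptors under 128 elements use the shared-memory-cached kernel on any
// device; 128..256 elements need compute capability >= 1.2; everything
// larger falls back to the generic, uncached kernel.
static KernelKind pickKernel(int descCols, int cc)
{
    if (descCols < 128)
        return CachedSmall;
    if (descCols <= 256 && cc >= 12)
        return CachedLarge;
    return Simple;
}
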
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/vec_distance.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace bfmatcher +{ + template + __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx) + { + if (threadIdx.x == 0) + { + smin[threadIdx.y] = myDist; + sIdx[threadIdx.y] = myIdx; + } + __syncthreads(); + + reducePredVal(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less()); + } + + template + __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_& train, const Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) + { + for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) + { + if (m(queryIdx, trainIdx)) + { + const T* trainDescs = train.ptr(trainIdx); + + Dist dist; + + vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x); + + const typename Dist::result_type res = dist; + + if (res < myDist) + { + myDist = res; + myIdx.x = trainIdx; + myIdx.y = imgIdx; + } + } + } + } + + template struct SingleTrain + { + explicit SingleTrain(const DevMem2D_& train_) : train(train_) + { + } + + template + __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const + { + matchDescs(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row); + } + + __device__ __forceinline__ int desc_len() const + { + return train.cols; + } + + static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, + float myDist, const int2& myIdx, int queryIdx) + { + trainIdx[queryIdx] = myIdx.x; + distance[queryIdx] = myDist; + } + + const DevMem2D_ train; + }; + + template struct TrainCollection + { + TrainCollection(const DevMem2D_* trainCollection_, int nImg_, int desclen_) : + trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_) + { + } + + template + __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const + { + for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) + { + const DevMem2D_ train = trainCollection[imgIdx]; + m.next(); + matchDescs(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row); + } + } + + __device__ __forceinline__ int desc_len() const + { + return desclen; + } + + static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, + float myDist, const int2& myIdx, int queryIdx) + { + trainIdx[queryIdx] = myIdx.x; + imgIdx[queryIdx] = myIdx.y; + distance[queryIdx] = myDist; + } + + const DevMem2D_* trainCollection; + const int nImg; + const int desclen; + }; + + template + __device__ void distanceCalcLoop(const PtrStep_& query, const Train& train, const Mask& mask, int queryIdx, + 
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem) + { + const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); + + typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y; + + Mask m = mask; + + myIdx.x = -1; + myIdx.y = -1; + myDist = numeric_limits::max(); + + train.template loop(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row); + } + + template + __global__ void match(const PtrStep_ query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance) + { + __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + + const int queryIdx = blockIdx.x; + + int2 myIdx; + typename Dist::result_type myDist; + + distanceCalcLoop(query, train, mask, queryIdx, myDist, myIdx, smem); + __syncthreads(); + + typename Dist::result_type* smin = smem; + int2* sIdx = (int2*)(smin + BLOCK_DIM_Y); + + findBestMatch(myDist, myIdx, smin, sIdx); + + if (threadIdx.x == 0 && threadIdx.y == 0) + Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match kernel caller + + template + void matchSimple_caller(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + StaticAssert::check(); // blockDimY values must reduce by warp + + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + match, Dist, T> + <<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void matchCached_caller(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + StaticAssert::check(); // blockDimY values must reduce by warp + StaticAssert= MAX_LEN>::check(); // block size must be greater than descriptor length + StaticAssert::check(); // max descriptor length must be divisible by blockDimX + + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + match, Dist, T> + <<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match Dispatcher + + template + void matchDispatcher(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (query.cols < 64) + { + matchCached_caller<16, 16, 64, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 64) + { + matchCached_caller<16, 16, 64, true, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols < 128) + { + matchCached_caller<16, 16, 128, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 128 && cc >= 12) + { + matchCached_caller<16, 16, 128, true, Dist>( + query, train, mask, + static_cast(trainIdx),
static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols < 256 && cc >= 12) + { + matchCached_caller<16, 16, 256, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 256 && cc >= 12) + { + matchCached_caller<16, 16, 256, true, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else + { + matchSimple_caller<16, 16, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Match caller + + template void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); + else + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); + } + + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); + } + + //template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void 
matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); + } + + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + if (maskCollection.data) + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); + else + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, 
const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + if (maskCollection.data) + matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + if (maskCollection.data) + matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + 
//template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); +}}} diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu new file mode 100644 index 0000000000..5dc8fabd9d --- /dev/null +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -0,0 +1,202 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage.
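
The radiusMatch kernel in the file that follows returns a variable number of matches per query: each thread whose distance falls under maxDistance reserves an output slot with atomicInc and silently drops the match once the row is full, while the counter still records the true match count. A stripped-down CUDA sketch of that append pattern (a hypothetical kernel; a single output row and its counter are shown for brevity):

__global__ void appendIfClose(const float* dist, int n, float maxDistance,
                              int* outIdx, unsigned int* count, int maxCount)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && dist[i] < maxDistance)
    {
        // Reserve a slot; the counter grows past maxCount, so the caller can
        // tell how many matches would have fit.
        const unsigned int slot = atomicInc(count, (unsigned int)-1);
        if (slot < (unsigned int)maxCount)
            outIdx[slot] = i;
    }
}
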
+// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/vec_distance.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace bfmatcher +{ + template + __global__ void radiusMatch(const PtrStep_ query, const DevMem2D_ train, float maxDistance, const Mask mask, + DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance) + { + #if __CUDA_ARCH__ >= 110 + + __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + + typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; + + const int queryIdx = blockIdx.x; + const T* queryDescs = query.ptr(queryIdx); + + const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; + + if (trainIdx < train.rows) + { + const T* trainDescs = train.ptr(trainIdx); + + if (mask(queryIdx, trainIdx)) + { + Dist dist; + + calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); + + if (threadIdx.x == 0) + { + if (dist < maxDistance) + { + unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1); + if (i < trainIdx_.cols) + { + distance.ptr(queryIdx)[i] = dist; + trainIdx_.ptr(queryIdx)[i] = trainIdx; + } + } + } + } + } + + #endif + } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match kernel caller + + template + void radiusMatch_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2D_& nMatches, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); + + radiusMatch<<>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match Dispatcher + + template + void radiusMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) + { + radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask, + static_cast(trainIdx), static_cast< const DevMem2D_ >(nMatches), static_cast(distance), + stream); + } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match caller + + template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) + { + if (mask.data) + { + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); + } + else + { + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, nMatches, distance, + stream); + } + } + + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const 
DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + + template void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) + { + if (mask.data) + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); + } + else + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, nMatches, distance, + stream); + } + } + + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + + template void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) + { + if (mask.data) + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); + } + else + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, 
WithOutMask(), + trainIdx, nMatches, distance, + stream); + } + } + + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); +}}} diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu new file mode 100644 index 0000000000..ae5d801486 --- /dev/null +++ b/modules/gpu/src/cuda/bilateral_filter.cu @@ -0,0 +1,233 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
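
The bilateral_filter kernel below updates the disparity image in two interleaved checkerboard passes: the t argument (0, then 1) selects which pixel parity a pass may write, so neighbouring pixels are never updated concurrently. The index arithmetic in isolation, as a host-side sketch (hypothetical helper):

// Pass t = 0 visits pixels where (x + y) is even, pass t = 1 the odd ones;
// each thread slot covers two adjacent columns, hence the shift by 1.
static int checkerboardX(int threadLinearX, int y, int t)
{
    return (threadLinearX << 1) + ((y + t) & 1);
}
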
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/limits.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace bf_krnls +{ + __constant__ float* ctable_color; + __constant__ float* ctable_space; + __constant__ size_t ctable_space_step; + + __constant__ int cndisp; + __constant__ int cradius; + + __constant__ short cedge_disc; + __constant__ short cmax_disc; +} + +namespace cv { namespace gpu { namespace bf +{ + void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) + { + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) ); + size_t table_space_step = table_space.step / sizeof(float); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); + + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); + + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); + } +}}} + +namespace bf_krnls +{ + template + struct DistRgbMax + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + { + uchar x = abs(a[0] - b[0]); + uchar y = abs(a[1] - b[1]); + uchar z = abs(a[2] - b[2]); + return (max(max(x, y), z)); + } + }; + + template <> + struct DistRgbMax<1> + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + { + return abs(a[0] - b[0]); + } + }; + + template + __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); + + T dp[5]; + + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * disp_step + x + 1); + + if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) + { + const int ymin = max(0, y - cradius); + const int xmin = max(0, x - cradius); + const int ymax = min(h - 1, y + cradius); + const int xmax = min(w - 1, x + cradius); + + float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + const uchar* ic = img + y * img_step + channels * x; + + for(int yi = ymin; yi <= ymax; yi++) + { + const T* disp_y = disp + yi * disp_step; + + for(int xi = xmin; xi <= xmax; xi++) + { + const uchar* in = img + yi * img_step + channels * xi; + + uchar dist_rgb = DistRgbMax::calc(in, ic); + + const float weight = 
ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; + + const T disp_reg = disp_y[xi]; + + cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; + cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; + cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; + cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; + cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; + } + } + + float minimum = numeric_limits::max(); + int id = 0; + + if (cost[0] < minimum) + { + minimum = cost[0]; + id = 0; + } + if (cost[1] < minimum) + { + minimum = cost[1]; + id = 1; + } + if (cost[2] < minimum) + { + minimum = cost[2]; + id = 2; + } + if (cost[3] < minimum) + { + minimum = cost[3]; + id = 3; + } + if (cost[4] < minimum) + { + minimum = cost[4]; + id = 4; + } + + *(disp + y * disp_step + x) = dp[id]; + } + } + } +} + +namespace cv { namespace gpu { namespace bf +{ + template + void bilateral_filter_caller(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + dim3 threads(32, 8, 1); + dim3 grid(1, 1, 1); + grid.x = divUp(disp.cols, threads.x << 1); + grid.y = divUp(disp.rows, threads.y); + + switch (channels) + { + case 1: + for (int i = 0; i < iters; ++i) + { + bf_krnls::bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + bf_krnls::bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + case 3: + for (int i = 0; i < iters; ++i) + { + bf_krnls::bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + bf_krnls::bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + default: + cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); + } + + void bilateral_filter_gpu(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); + } +}}} diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu deleted file mode 100644 index f5afda75b5..0000000000 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ /dev/null @@ -1,980 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners.
-// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/vec_distance.hpp" - -using namespace cv::gpu; -using namespace cv::gpu::device; - -namespace cv { namespace gpu { namespace bfmatcher -{ - -/////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////// Match ////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////// - - template - __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx) - { - if (threadIdx.x == 0) - { - smin[threadIdx.y] = myDist; - sIdx[threadIdx.y] = myIdx; - } - __syncthreads(); - - reducePredVal(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less()); - } - - template - __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_& train, const Mask& m, const VecDiff& vecDiff, - typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) - { - for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) - { - if (m(queryIdx, trainIdx)) - { - const T* trainDescs = train.ptr(trainIdx); - - Dist dist; - - vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - - const typename Dist::result_type res = dist; - - if (res < myDist) - { - myDist = res; - myIdx.x = trainIdx; - myIdx.y = imgIdx; - } - } - } - } - - template struct SingleTrain - { - explicit SingleTrain(const DevMem2D_& train_) : train(train_) - { - } - - template - __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, - typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const - { - matchDescs(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row); - } - - __device__ __forceinline__ int desc_len() const - { - return train.cols; - } - - static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, - float myDist, const int2& myIdx, int queryIdx) - { - trainIdx[queryIdx] 
= myIdx.x; - distance[queryIdx] = myDist; - } - - const DevMem2D_ train; - }; - - template struct TrainCollection - { - TrainCollection(const DevMem2D_* trainCollection_, int nImg_, int desclen_) : - trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_) - { - } - - template - __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, - typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const - { - for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) - { - const DevMem2D_ train = trainCollection[imgIdx]; - m.next(); - matchDescs(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row); - } - } - - __device__ __forceinline__ int desc_len() const - { - return desclen; - } - - static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, - float myDist, const int2& myIdx, int queryIdx) - { - trainIdx[queryIdx] = myIdx.x; - imgIdx[queryIdx] = myIdx.y; - distance[queryIdx] = myDist; - } - - const DevMem2D_* trainCollection; - const int nImg; - const int desclen; - }; - - template - __device__ void distanceCalcLoop(const PtrStep_& query, const Train& train, const Mask& mask, int queryIdx, - typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem) - { - const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); - - typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y; - - Mask m = mask; - - myIdx.x = -1; - myIdx.y = -1; - myDist = numeric_limits::max(); - - train.template loop(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row); - } - - template - __global__ void match(const PtrStep_ query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance) - { - __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - - const int queryIdx = blockIdx.x; - - int2 myIdx; - typename Dist::result_type myDist; - - distanceCalcLoop(query, train, mask, queryIdx, myDist, myIdx, smem); - __syncthreads(); - - typename Dist::result_type* smin = smem; - int2* sIdx = (int2*)(smin + BLOCK_DIM_Y); - - findBestMatch(myDist, myIdx, smin, sIdx); - - if (threadIdx.x == 0 && threadIdx.y == 0) - Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx); - } - - /////////////////////////////////////////////////////////////////////////////// - // Match kernel caller - - template - void matchSimple_caller(const DevMem2D_& query, const Train& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - StaticAssert::check(); // blockDimY vals must reduce by warp - - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - - match, Dist, T> - <<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void matchCached_caller(const DevMem2D_& query, const Train& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - StaticAssert::check(); // blockDimY vals must reduce by warp - StaticAssert= MAX_LEN>::check(); // block size must be greater than descriptors length - StaticAssert::check(); // max descriptors length must be divisible by blockDimX - - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - - match, Dist, T> - 
<<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - /////////////////////////////////////////////////////////////////////////////// - // Match Dispatcher - - template - void matchDispatcher(const DevMem2D_& query, const Train& train, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - if (query.cols < 64) - { - matchCached_caller<16, 16, 64, false, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols == 64) - { - matchCached_caller<16, 16, 64, true, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols < 128) - { - matchCached_caller<16, 16, 128, false, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols == 128 && cc >= 12) - { - matchCached_caller<16, 16, 128, true, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols < 256 && cc >= 12) - { - matchCached_caller<16, 16, 256, false, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols == 256 && cc >= 12) - { - matchCached_caller<16, 16, 256, true, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else - { - matchSimple_caller<16, 16, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Match caller - - template void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - SingleTrain train(static_cast< DevMem2D_ >(train_)); - if (mask.data) - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); - else - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); - } - - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const 
DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - SingleTrain train(static_cast< DevMem2D_ >(train_)); - if (mask.data) - matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); - } - - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - SingleTrain train(static_cast< DevMem2D_ >(train_)); - if (mask.data) - matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); - } - - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), 
trainCollection.cols, query.cols); - if (maskCollection.data) - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); - else - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); - } - - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); - if (maskCollection.data) - matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); - } - - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& 
distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); - if (maskCollection.data) - matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); - } - - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - -/////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////// Knn Match //////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////// - - template - __device__ void distanceCalcLoop(const PtrStep_& query, const DevMem2D_& train, const Mask& m, int queryIdx, - typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, - typename Dist::result_type* smem) - { - const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); - - typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y; - - distMin1 = numeric_limits::max(); - distMin2 = numeric_limits::max(); - - bestTrainIdx1 = -1; - bestTrainIdx2 = -1; - - for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) - { - if (m(queryIdx, trainIdx)) - { - Dist dist; - - const T* trainRow = train.ptr(trainIdx); - - vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); - - const typename Dist::result_type val = dist; - - if (val < distMin1) - { - distMin1 = val; - bestTrainIdx1 = trainIdx; - } - else if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = trainIdx; - } - } - } - } - - template - __global__ void knnMatch2(const PtrStep_ query, const DevMem2D_ 
train, const Mask m, int2* trainIdx, float2* distance) - { - typedef typename Dist::result_type result_type; - typedef typename Dist::value_type value_type; - - __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - - const int queryIdx = blockIdx.x; - - result_type distMin1; - result_type distMin2; - - int bestTrainIdx1; - int bestTrainIdx2; - - distanceCalcLoop(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); - __syncthreads(); - - volatile result_type* sdistMinRow = smem; - volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y); - - if (threadIdx.x == 0) - { - sdistMinRow[threadIdx.y] = distMin1; - sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2; - - sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1; - sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2; - } - __syncthreads(); - - if (threadIdx.x == 0 && threadIdx.y == 0) - { - distMin1 = numeric_limits::max(); - distMin2 = numeric_limits::max(); - - bestTrainIdx1 = -1; - bestTrainIdx2 = -1; - - #pragma unroll - for (int i = 0; i < BLOCK_DIM_Y; ++i) - { - result_type val = sdistMinRow[i]; - - if (val < distMin1) - { - distMin1 = val; - bestTrainIdx1 = sbestTrainIdxRow[i]; - } - else if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = sbestTrainIdxRow[i]; - } - } - - #pragma unroll - for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i) - { - result_type val = sdistMinRow[i]; - - if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = sbestTrainIdxRow[i]; - } - } - - trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2); - distance[queryIdx] = make_float2(distMin1, distMin2); - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Knn 2 Match kernel caller - - template - void knnMatch2Simple_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - - knnMatch2, Dist, T> - <<>>(query, train, mask, trainIdx, distance); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void knnMatch2Cached_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - StaticAssert= MAX_LEN>::check(); // block size must be greater than descriptors length - StaticAssert::check(); // max descriptors length must be divisible by blockDimX - - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - - knnMatch2, Dist, T> - <<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - /////////////////////////////////////////////////////////////////////////////// - // Knn 2 Match Dispatcher - - template - void knnMatch2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - if (query.cols < 64) - { - knnMatch2Cached_caller<16, 16, 64, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if (query.cols == 64) - { - knnMatch2Cached_caller<16, 16, 64, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if 
(query.cols < 128) - { - knnMatch2Cached_caller<16, 16, 128, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if (query.cols == 128 && cc >= 12) - { - knnMatch2Cached_caller<16, 16, 128, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if (query.cols < 256 && cc >= 12) - { - knnMatch2Cached_caller<16, 16, 256, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if (query.cols == 256 && cc >= 12) - { - knnMatch2Cached_caller<16, 16, 256, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else - { - knnMatch2Simple_caller<16, 16, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Calc distance kernel - - template - __global__ void calcDistance(const PtrStep_ query, const DevMem2D_ train, const Mask mask, PtrStepf distance) - { - __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; - - typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; - - const int queryIdx = blockIdx.x; - const T* queryDescs = query.ptr(queryIdx); - - const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - - if (trainIdx < train.rows) - { - const T* trainDescs = train.ptr(trainIdx); - - typename Dist::result_type myDist = numeric_limits::max(); - - if (mask(queryIdx, trainIdx)) - { - Dist dist; - - calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - - myDist = dist; - } - - if (threadIdx.x == 0) - distance.ptr(queryIdx)[trainIdx] = myDist; - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Calc distance kernel caller - - template - void calcDistance_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream) - { - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); - - calcDistance<<>>(query, train, mask, distance); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream) - { - calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast(allDist), stream); - } - - /////////////////////////////////////////////////////////////////////////////// - // find knn match kernel - - template __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_) - { - const int SMEM_SIZE = BLOCK_SIZE > 64 ? 
BLOCK_SIZE : 64; - __shared__ float sdist[SMEM_SIZE]; - __shared__ int strainIdx[SMEM_SIZE]; - - const int queryIdx = blockIdx.x; - - float* allDist = allDist_.ptr(queryIdx); - int* trainIdx = trainIdx_.ptr(queryIdx); - float* distance = distance_.ptr(queryIdx); - - float dist = numeric_limits::max(); - int bestIdx = -1; - - for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE) - { - float reg = allDist[i]; - if (reg < dist) - { - dist = reg; - bestIdx = i; - } - } - - sdist[threadIdx.x] = dist; - strainIdx[threadIdx.x] = bestIdx; - __syncthreads(); - - reducePredVal(sdist, dist, strainIdx, bestIdx, threadIdx.x, less()); - - if (threadIdx.x == 0) - { - if (dist < numeric_limits::max()) - { - allDist[bestIdx] = numeric_limits::max(); - trainIdx[i] = bestIdx; - distance[i] = dist; - } - } - } - - /////////////////////////////////////////////////////////////////////////////// - // find knn match kernel caller - - template void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) - { - const dim3 threads(BLOCK_SIZE, 1, 1); - const dim3 grid(trainIdx.rows, 1, 1); - - for (int i = 0; i < k; ++i) - { - findBestMatch<<>>(allDist, i, trainIdx, distance); - cudaSafeCall( cudaGetLastError() ); - } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream) - { - findKnnMatch_caller<256>(k, static_cast(trainIdx), static_cast(distance), static_cast(allDist), stream); - } - - /////////////////////////////////////////////////////////////////////////////// - // knn match Dispatcher - - template - void knnMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream) - { - if (mask.data) - { - if (k == 2) - { - knnMatch2Dispatcher(query, train, SingleMask(mask), trainIdx, distance, cc, stream); - return; - } - - calcDistanceDispatcher(query, train, SingleMask(mask), allDist, stream); - } - else - { - if (k == 2) - { - knnMatch2Dispatcher(query, train, WithOutMask(), trainIdx, distance, cc, stream); - return; - } - - calcDistanceDispatcher(query, train, WithOutMask(), allDist, stream); - } - - findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); - } - - /////////////////////////////////////////////////////////////////////////////// - // knn match caller - - template void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream) - { - knnMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); - } - - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const 
DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - - template void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream) - { - knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); - } - - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - - template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream) - { - knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); - } - - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const 
DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - -/////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////// Radius Match ////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////// - - template - __global__ void radiusMatch(const PtrStep_ query, const DevMem2D_ train, float maxDistance, const Mask mask, - DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance) - { - #if __CUDA_ARCH__ >= 110 - - __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - - typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; - - const int queryIdx = blockIdx.x; - const T* queryDescs = query.ptr(queryIdx); - - const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - - if (trainIdx < train.rows) - { - const T* trainDescs = train.ptr(trainIdx); - - if (mask(queryIdx, trainIdx)) - { - Dist dist; - - calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - - if (threadIdx.x == 0) - { - if (dist < maxDistance) - { - unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (i < trainIdx_.cols) - { - distance.ptr(queryIdx)[i] = dist; - trainIdx_.ptr(queryIdx)[i] = trainIdx; - } - } - } - } - } - - #endif - } - - /////////////////////////////////////////////////////////////////////////////// - // Radius Match kernel caller - - template - void radiusMatch_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2D_& nMatches, const DevMem2Df& distance, - cudaStream_t stream) - { - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); - - radiusMatch<<>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - /////////////////////////////////////////////////////////////////////////////// - // Radius Match Dispatcher - - template - void radiusMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, - cudaStream_t stream) - { - radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask, - static_cast(trainIdx), static_cast< const DevMem2D_ >(nMatches), static_cast(distance), - stream); - } - - /////////////////////////////////////////////////////////////////////////////// - // Radius Match caller - - template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, - cudaStream_t stream) - { - if (mask.data) - { - radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, - stream); - } - else - { - radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, - 
stream); - } - } - - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - - template void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, - cudaStream_t stream) - { - if (mask.data) - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, - stream); - } - else - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, - stream); - } - } - - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - - template void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const 
DevMem2D& nMatches, const DevMem2D& distance, - cudaStream_t stream) - { - if (mask.data) - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, - stream); - } - else - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, - stream); - } - } - - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); -}}} diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu new file mode 100644 index 0000000000..3c32769c52 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.cu @@ -0,0 +1,240 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
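The new column filter below drives its CUDA kernels through a two-dimensional dispatch table indexed by border type and kernel size, so the tap count becomes a compile-time template parameter and the convolution loop can be unrolled. A reduced, host-only sketch of the same table-of-instantiations pattern (all names here are illustrative, not from the patch):

    #include <cstdio>

    // One instantiation per supported kernel size; KSIZE is a compile-time
    // constant, so the compiler can fully unroll the tap loop.
    template <int KSIZE>
    void applyKernel(const float* taps)
    {
        float sum = 0.f;
        for (int i = 0; i < KSIZE; ++i)
            sum += taps[i];
        std::printf("ksize=%d, tap sum=%f\n", KSIZE, sum);
    }

    typedef void (*caller_t)(const float*);

    int main()
    {
        // Index 0 is unused, mirroring the 0 entries in the tables below.
        static const caller_t callers[] =
            { 0, applyKernel<1>, applyKernel<2>, applyKernel<3>, applyKernel<4>, applyKernel<5> };

        const float gauss3[3] = { 0.25f, 0.5f, 0.25f };
        callers[3](gauss3); // a runtime ksize picks the compile-time instantiation
        return 0;
    }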
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +#define MAX_KERNEL_SIZE 16 +#define BLOCK_DIM_X 16 +#define BLOCK_DIM_Y 16 + +namespace filter_krnls_column +{ + __constant__ float cLinearKernel[MAX_KERNEL_SIZE]; + + void loadLinearKernel(const float kernel[], int ksize) + { + cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) ); + } + + template + __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep_ dst, int anchor, const B b) + { + __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; + + const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; + const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; + + T* sDataColumn = smem + threadIdx.x; + + if (x < src.cols) + { + const T* srcCol = src.ptr() + x; + + sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step); + sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step); + sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step); + + __syncthreads(); + + if (y < src.rows) + { + typedef typename TypeVec::cn>::vec_type sum_t; + sum_t sum = VecTraits::all(0); + + sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X; + + #pragma unroll + for(int i = 0; i < ksize; ++i) + sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i]; + + dst.ptr(y)[x] = saturate_cast(sum); + } + } + } +} + +namespace cv { namespace gpu { namespace filters +{ + template class B> + void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) + { + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); + dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); + + B b(src.rows); + + if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1)) + { + cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, " + "try bigger image or another border extrapolation mode", __FILE__, __LINE__); + } + + filter_krnls_column::linearColumnFilter<<>>(src, dst, anchor, b); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) + { + typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); + static const caller_t callers[5][17] = + { + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColReflect101>, + linearColumnFilter_caller<2 , T, D, BrdColReflect101>, + linearColumnFilter_caller<3 , T, D, BrdColReflect101>, + linearColumnFilter_caller<4 , T, D, BrdColReflect101>, + linearColumnFilter_caller<5 , T, D, 
+
+namespace cv { namespace gpu { namespace filters
+{
+    template <int ksize, typename T, typename D, template<typename> class B>
+    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
+    {
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        B<T> b(src.rows);
+
+        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
+        {
+            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
+                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
+        }
+
+        filter_krnls_column::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T, typename D>
+    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
+        static const caller_t callers[5][17] =
+        {
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<10, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<11, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<12, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<13, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<16, T, D, BrdColReflect101>
+            },
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<10, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<11, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<12, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<13, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<16, T, D, BrdColReplicate>
+            },
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<3 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<4 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<5 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<6 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<7 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<8 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<9 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<10, T, D, BrdColConstant>,
+                linearColumnFilter_caller<11, T, D, BrdColConstant>,
+                linearColumnFilter_caller<12, T, D, BrdColConstant>,
+                linearColumnFilter_caller<13, T, D, BrdColConstant>,
+                linearColumnFilter_caller<14, T, D, BrdColConstant>,
+                linearColumnFilter_caller<15, T, D, BrdColConstant>,
+                linearColumnFilter_caller<16, T, D, BrdColConstant>
+            },
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<2 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<3 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<4 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<5 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<6 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<7 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<8 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<9 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<10, T, D, BrdColReflect>,
+                linearColumnFilter_caller<11, T, D, BrdColReflect>,
+                linearColumnFilter_caller<12, T, D, BrdColReflect>,
+                linearColumnFilter_caller<13, T, D, BrdColReflect>,
+                linearColumnFilter_caller<14, T, D, BrdColReflect>,
+                linearColumnFilter_caller<15, T, D, BrdColReflect>,
+                linearColumnFilter_caller<16, T, D, BrdColReflect>
+            },
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<2 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<3 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<4 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<5 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<6 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<7 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<8 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<9 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<10, T, D, BrdColWrap>,
+                linearColumnFilter_caller<11, T, D, BrdColWrap>,
+                linearColumnFilter_caller<12, T, D, BrdColWrap>,
+                linearColumnFilter_caller<13, T, D, BrdColWrap>,
+                linearColumnFilter_caller<14, T, D, BrdColWrap>,
+                linearColumnFilter_caller<15, T, D, BrdColWrap>,
+                linearColumnFilter_caller<16, T, D, BrdColWrap>,
+            }
+        };
+
+        filter_krnls_column::loadLinearKernel(kernel, ksize);
+
+        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+    }
+
+    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    //template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    //template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+}}}
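The 5x17 `callers` array above maps a runtime (border mode, kernel size) pair straight to a fully specialized launcher; slot 0 of each row is a null sentinel because kernel sizes start at 1, and the five rows follow the table's own order (reflect-101, replicate, constant, reflect, wrap). A tiny standalone rendering of this dispatch pattern, with hypothetical names invented for the illustration:

    #include <cstdio>

    typedef void (*op_t)(int);                 // one signature for every specialization

    template <int K> void runK(int x) { std::printf("K=%d, x=%d\n", K, x); }

    static const op_t table[2][4] =
    {
        {0, runK<1>, runK<2>, runK<3>},        // e.g. "mode 0"
        {0, runK<1>, runK<2>, runK<3>},        // e.g. "mode 1"
    };

    int main()
    {
        int mode = 1, k = 2;
        table[mode][k](42);                    // runtime values pick a compile-time specialization
        return 0;
    }

The payoff is that `ksize` becomes a compile-time constant inside the kernel, so the `#pragma unroll` loop over the taps can actually unroll.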
diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu
deleted file mode 100644
index 41f403a281..0000000000
--- a/modules/gpu/src/cuda/filters.cu
+++ /dev/null
@@ -1,633 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "internal_shared.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vec_math.hpp"
-#include "opencv2/gpu/device/limits.hpp"
-#include "opencv2/gpu/device/border_interpolate.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::device;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Linear filters
-
-#define MAX_KERNEL_SIZE 16
-#define BLOCK_DIM_X 16
-#define BLOCK_DIM_Y 16
-
-namespace filter_krnls
-{
-    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];
-}
-
-namespace cv { namespace gpu { namespace filters
-{
-    void loadLinearKernel(const float kernel[], int ksize)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(filter_krnls::cLinearKernel, kernel, ksize * sizeof(float)) );
-    }
-}}}
-
-namespace filter_krnls
-{
-    template <typename T, size_t size> struct SmemType_
-    {
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
-    };
-    template <typename T> struct SmemType_<T, 4>
-    {
-        typedef T smem_t;
-    };
-    template <typename T> struct SmemType
-    {
-        typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t;
-    };
-
-    template <int ksize, typename T, typename D, typename B>
-    __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
-    {
-        typedef typename SmemType<T>::smem_t smem_t;
-
-        __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
-
-        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
-        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
-
-        smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;
-
-        if (y < src.rows)
-        {
-            const T* rowSrc = src.ptr(y);
-
-            sDataRow[threadIdx.x                  ] = b.at_low(x - BLOCK_DIM_X, rowSrc);
-            sDataRow[threadIdx.x + BLOCK_DIM_X    ] = b.at_high(x, rowSrc);
-            sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);
-
-            __syncthreads();
-
-            if (x < src.cols)
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-                sum_t sum = VecTraits<sum_t>::all(0);
-
-                sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
-
-                #pragma unroll
-                for(int i = 0; i < ksize; ++i)
-                    sum = sum + sDataRow[i] * cLinearKernel[i];
-
-                dst.ptr(y)[x] = saturate_cast<D>(sum);
-            }
-        }
-    }
-}
-
-namespace cv { namespace gpu { namespace filters
-{
-    template <int ksize, typename T, typename D, template<typename> class B>
-    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
-    {
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
-        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
-
-        typedef typename filter_krnls::SmemType<T>::smem_t smem_t;
-        B<smem_t> b(src.cols);
-
-        if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
-        {
-            cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "
-                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
-        }
-
-        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <typename T, typename D>
-    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
-        static const caller_t callers[5][17] =
-        {
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<10, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<11, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<12, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<13, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<14, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<15, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<16, T, D, BrdRowReflect101>
-            },
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<10, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<11, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<12, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<13, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<14, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<15, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<16, T, D, BrdRowReplicate>
-            },
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<2 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<3 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<4 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<5 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<6 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<7 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<8 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<9 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<10, T, D, BrdRowConstant>,
-                linearRowFilter_caller<11, T, D, BrdRowConstant>,
-                linearRowFilter_caller<12, T, D, BrdRowConstant>,
-                linearRowFilter_caller<13, T, D, BrdRowConstant>,
-                linearRowFilter_caller<14, T, D, BrdRowConstant>,
-                linearRowFilter_caller<15, T, D, BrdRowConstant>,
-                linearRowFilter_caller<16, T, D, BrdRowConstant>
-            },
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<2 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<3 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<4 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<5 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<6 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<7 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<8 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<9 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<10, T, D, BrdRowReflect>,
-                linearRowFilter_caller<11, T, D, BrdRowReflect>,
-                linearRowFilter_caller<12, T, D, BrdRowReflect>,
-                linearRowFilter_caller<13, T, D, BrdRowReflect>,
-                linearRowFilter_caller<14, T, D, BrdRowReflect>,
-                linearRowFilter_caller<15, T, D, BrdRowReflect>,
-                linearRowFilter_caller<16, T, D, BrdRowReflect>
-            },
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<2 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<3 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<4 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<5 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<6 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<7 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<8 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<9 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<10, T, D, BrdRowWrap>,
-                linearRowFilter_caller<11, T, D, BrdRowWrap>,
-                linearRowFilter_caller<12, T, D, BrdRowWrap>,
-                linearRowFilter_caller<13, T, D, BrdRowWrap>,
-                linearRowFilter_caller<14, T, D, BrdRowWrap>,
-                linearRowFilter_caller<15, T, D, BrdRowWrap>,
-                linearRowFilter_caller<16, T, D, BrdRowWrap>
-            }
-        };
-
-        loadLinearKernel(kernel, ksize);
-
-        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
-    }
-
-    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-}}}
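The SmemType_ trait in the deleted row filter above chooses the shared-memory element type: 4-byte source types are staged as-is, while narrower or wider types are promoted to the float vector used for accumulation. A compact standalone rendering of the same sizeof-based dispatch, with names invented for this sketch:

    #include <cstdio>
    #include <cstddef>
    #include <type_traits>

    // Primary template: anything that is not 4 bytes wide gets staged as float.
    template <typename T, size_t size> struct Promote       { typedef float type; };
    // Partial specialization: 4-byte element types are staged unchanged.
    template <typename T>              struct Promote<T, 4> { typedef T type; };

    template <typename T> struct Smem { typedef typename Promote<T, sizeof(T)>::type type; };

    int main()
    {
        std::printf("uchar staged as float: %d\n", (int)std::is_same<Smem<unsigned char>::type, float>::value); // 1
        std::printf("int   staged as int  : %d\n", (int)std::is_same<Smem<int>::type, int>::value);             // 1
        return 0;
    }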
-
-namespace filter_krnls
-{
-    template <int ksize, typename T, typename D, typename B>
-    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
-    {
-        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
-
-        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
-        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
-
-        T* sDataColumn = smem + threadIdx.x;
-
-        if (x < src.cols)
-        {
-            const T* srcCol = src.ptr() + x;
-
-            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
-            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
-            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);
-
-            __syncthreads();
-
-            if (y < src.rows)
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-                sum_t sum = VecTraits<sum_t>::all(0);
-
-                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
-
-                #pragma unroll
-                for(int i = 0; i < ksize; ++i)
-                    sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];
-
-                dst.ptr(y)[x] = saturate_cast<D>(sum);
-            }
-        }
-    }
-}
-
-namespace cv { namespace gpu { namespace filters
-{
-    template <int ksize, typename T, typename D, template<typename> class B>
-    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
-    {
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
-        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
-
-        B<T> b(src.rows);
-
-        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
-        {
-            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
-                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
-        }
-
-        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <typename T, typename D>
-    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
-        static const caller_t callers[5][17] =
-        {
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<10, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<11, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<12, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<13, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<16, T, D, BrdColReflect101>
-            },
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<10, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<11, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<12, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<13, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<16, T, D, BrdColReplicate>
-            },
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<3 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<4 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<5 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<6 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<7 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<8 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<9 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<10, T, D, BrdColConstant>,
-                linearColumnFilter_caller<11, T, D, BrdColConstant>,
-                linearColumnFilter_caller<12, T, D, BrdColConstant>,
-                linearColumnFilter_caller<13, T, D, BrdColConstant>,
-                linearColumnFilter_caller<14, T, D, BrdColConstant>,
-                linearColumnFilter_caller<15, T, D, BrdColConstant>,
-                linearColumnFilter_caller<16, T, D, BrdColConstant>
-            },
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<2 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<3 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<4 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<5 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<6 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<7 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<8 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<9 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<10, T, D, BrdColReflect>,
-                linearColumnFilter_caller<11, T, D, BrdColReflect>,
-                linearColumnFilter_caller<12, T, D, BrdColReflect>,
-                linearColumnFilter_caller<13, T, D, BrdColReflect>,
-                linearColumnFilter_caller<14, T, D, BrdColReflect>,
-                linearColumnFilter_caller<15, T, D, BrdColReflect>,
-                linearColumnFilter_caller<16, T, D, BrdColReflect>
-            },
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<2 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<3 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<4 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<5 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<6 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<7 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<8 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<9 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<10, T, D, BrdColWrap>,
-                linearColumnFilter_caller<11, T, D, BrdColWrap>,
-                linearColumnFilter_caller<12, T, D, BrdColWrap>,
-                linearColumnFilter_caller<13, T, D, BrdColWrap>,
-                linearColumnFilter_caller<14, T, D, BrdColWrap>,
-                linearColumnFilter_caller<15, T, D, BrdColWrap>,
-                linearColumnFilter_caller<16, T, D, BrdColWrap>,
-            }
-        };
-
-        loadLinearKernel(kernel, ksize);
-
-        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
-    }
-
-    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-}}}
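Blocks of explicit instantiations like the one above are what force the compiler to emit object code for each supported (source, destination) type pair; nothing else in the translation unit names concrete types. A minimal illustration of the idiom, using a hypothetical function invented for this sketch:

    #include <cstdio>

    template <typename T>
    T triple(T x) { return x * 3; }

    // Explicit instantiation: code for these two specializations is emitted
    // into this object file, available for other files to link against.
    template int   triple<int>(int);
    template float triple<float>(float);

    int main()
    {
        std::printf("%d %g\n", triple(14), triple(0.5f));
        return 0;
    }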
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Bilateral filters
-
-namespace bf_krnls
-{
-    __constant__ float* ctable_color;
-    __constant__ float* ctable_space;
-    __constant__ size_t ctable_space_step;
-
-    __constant__ int cndisp;
-    __constant__ int cradius;
-
-    __constant__ short cedge_disc;
-    __constant__ short cmax_disc;
-}
-
-namespace cv { namespace gpu { namespace bf
-{
-    void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
-        size_t table_space_step = table_space.step / sizeof(float);
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
-
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
-
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
-    }
-}}}
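Note that ctable_color and ctable_space are device pointers stored in constant memory: load_constants copies the pointer value itself to the symbol, not the table contents, so the lookup tables stay in global memory and can be rebuilt without touching the kernels. A standalone sketch of that pattern; `c_lut` and `readLut` are names made up for the illustration:

    #include <cstdio>
    #include <cuda_runtime.h>

    __constant__ float* c_lut;   // the pointer lives in constant memory...

    __global__ void readLut(float* out)
    {
        *out = c_lut[3];         // ...the data it points at stays in global memory
    }

    int main()
    {
        float h_lut[8] = {0, 1, 2, 3, 4, 5, 6, 7};
        float *d_lut, *d_out, h_out;
        cudaMalloc(&d_lut, sizeof(h_lut));
        cudaMalloc(&d_out, sizeof(float));
        cudaMemcpy(d_lut, h_lut, sizeof(h_lut), cudaMemcpyHostToDevice);
        cudaMemcpyToSymbol(c_lut, &d_lut, sizeof(d_lut)); // copy the pointer, as load_constants does
        readLut<<<1, 1>>>(d_out);
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        std::printf("%g\n", h_out);                       // prints 3
        cudaFree(d_lut); cudaFree(d_out);
        return 0;
    }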
-
-namespace bf_krnls
-{
-    template <int channels>
-    struct DistRgbMax
-    {
-        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
-        {
-            uchar x = abs(a[0] - b[0]);
-            uchar y = abs(a[1] - b[1]);
-            uchar z = abs(a[2] - b[2]);
-            return (max(max(x, y), z));
-        }
-    };
-
-    template <>
-    struct DistRgbMax<1>
-    {
-        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
-        {
-            return abs(a[0] - b[0]);
-        }
-    };
-
-    template <int channels, typename T>
-    __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
-    {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
-
-        T dp[5];
-
-        if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-        {
-            dp[0] = *(disp + (y  ) * disp_step + x + 0);
-            dp[1] = *(disp + (y-1) * disp_step + x + 0);
-            dp[2] = *(disp + (y  ) * disp_step + x - 1);
-            dp[3] = *(disp + (y+1) * disp_step + x + 0);
-            dp[4] = *(disp + (y  ) * disp_step + x + 1);
-
-            if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
-            {
-                const int ymin = max(0, y - cradius);
-                const int xmin = max(0, x - cradius);
-                const int ymax = min(h - 1, y + cradius);
-                const int xmax = min(w - 1, x + cradius);
-
-                float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-
-                const uchar* ic = img + y * img_step + channels * x;
-
-                for(int yi = ymin; yi <= ymax; yi++)
-                {
-                    const T* disp_y = disp + yi * disp_step;
-
-                    for(int xi = xmin; xi <= xmax; xi++)
-                    {
-                        const uchar* in = img + yi * img_step + channels * xi;
-
-                        uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
-
-                        const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];
-
-                        const T disp_reg = disp_y[xi];
-
-                        cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
-                        cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
-                        cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
-                        cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
-                        cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
-                    }
-                }
-
-                float minimum = numeric_limits<float>::max();
-                int id = 0;
-
-                if (cost[0] < minimum)
-                {
-                    minimum = cost[0];
-                    id = 0;
-                }
-                if (cost[1] < minimum)
-                {
-                    minimum = cost[1];
-                    id = 1;
-                }
-                if (cost[2] < minimum)
-                {
-                    minimum = cost[2];
-                    id = 2;
-                }
-                if (cost[3] < minimum)
-                {
-                    minimum = cost[3];
-                    id = 3;
-                }
-                if (cost[4] < minimum)
-                {
-                    minimum = cost[4];
-                    id = 4;
-                }
-
-                *(disp + y * disp_step + x) = dp[id];
-            }
-        }
-    }
-}
-
-namespace cv { namespace gpu { namespace bf
-{
-    template <typename T>
-    void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
-    {
-        dim3 threads(32, 8, 1);
-        dim3 grid(1, 1, 1);
-        grid.x = divUp(disp.cols, threads.x << 1);
-        grid.y = divUp(disp.rows, threads.y);
-
-        switch (channels)
-        {
-        case 1:
-            for (int i = 0; i < iters; ++i)
-            {
-                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                cudaSafeCall( cudaGetLastError() );
-                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                cudaSafeCall( cudaGetLastError() );
-            }
-            break;
-        case 3:
-            for (int i = 0; i < iters; ++i)
-            {
-                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                cudaSafeCall( cudaGetLastError() );
-                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                cudaSafeCall( cudaGetLastError() );
-            }
-            break;
-        default:
-            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
-        }
-
-        if (stream != 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
-    {
-        bilateral_filter_caller(disp, img, channels, iters, stream);
-    }
-
-    void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
-    {
-        bilateral_filter_caller(disp, img, channels, iters, stream);
-    }
-}}}
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index d76f93b9a5..67cfa59b42 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -41,433 +41,16 @@
 //M*/
 
 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/border_interpolate.hpp"
 #include "opencv2/gpu/device/vec_traits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/filters.hpp"
+#include "opencv2/gpu/device/border_interpolate.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
 
-/////////////////////////////////// Remap ///////////////////////////////////////////////
 namespace cv { namespace gpu { namespace imgproc
 {
-    template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
-    {
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (x < dst.cols && y < dst.rows)
-        {
-            const float xcoo = mapx.ptr(y)[x];
-            const float ycoo = mapy.ptr(y)[x];
-
-            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
-        }
-    }
-
-    template