diff --git a/modules/gpu/include/opencv2/gpu/devmem2d.hpp b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
index 443330eed4..80290ed2e9 100644
--- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp
+++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
@@ -50,7 +50,7 @@ namespace cv
         // Simple lightweight structure that encapsulates image ptr on device, its pitch and its sizes.
         // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
-        template <typename T = unsigned char>
+        template <typename T>
         struct DevMem2D_
         {
             typedef T elem_t;
@@ -60,16 +60,21 @@ namespace cv
             int rows;
             T* ptr;
             size_t step;
+            size_t elem_step;
 
-            DevMem2D_() : cols(0), rows(0), ptr(0), step(0) {}
+            DevMem2D_() : cols(0), rows(0), ptr(0), step(0), elem_step(0) {}
 
             DevMem2D_(int rows_, int cols_, T *ptr_, size_t step_)
-                : cols(cols_), rows(rows_), ptr(ptr_), step(step_) {}
+                : cols(cols_), rows(rows_), ptr(ptr_), step(step_), elem_step(step_ / sizeof(T)) {}
+
+            template <typename U>
+            explicit DevMem2D_(const DevMem2D_<U>& d)
+                : cols(d.cols), rows(d.rows), ptr((T*)d.ptr), step(d.step), elem_step(d.step / sizeof(T)) {}
 
             size_t elemSize() const { return elem_size; }
         };
 
-        typedef DevMem2D_<> DevMem2D;
+        typedef DevMem2D_<unsigned char> DevMem2D;
         typedef DevMem2D_<float> DevMem2Df;
         typedef DevMem2D_<int> DevMem2Di;
     }
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 93a7304fd9..1146de400b 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -636,7 +636,7 @@ namespace cv
         //! returns the separable filter engine with the specified filters
         CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-            const Ptr<BaseColumnFilter_GPU>& columnFilter, bool rowFilterFirst = true);
+            const Ptr<BaseColumnFilter_GPU>& columnFilter);
 
         //! returns horizontal 1D box filter
         //! supports only CV_8UC1 source type and CV_32FC1 sum type
@@ -658,7 +658,7 @@ namespace cv
         //! only MORPH_ERODE and MORPH_DILATE are supported
         //! supports CV_8UC1 and CV_8UC4 types
         //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const GpuMat& kernel, const Size& ksize,
+        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
             Point anchor=Point(-1,-1));
 
         //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
@@ -667,25 +667,24 @@ namespace cv
         //! returns 2D filter with the specified kernel
         //! supports CV_8UC1 and CV_8UC4 types
-        //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const GpuMat& kernel, const Size& ksize,
-            Point anchor = Point(-1, -1), int nDivisor = 1);
+        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
+            Point anchor = Point(-1, -1));
 
         //! returns the non-separable linear filter engine
         CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor = Point(-1,-1));
 
         //! returns the primitive row filter with the specified kernel
-        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const GpuMat& rowKernel,
-            int anchor = -1, int nDivisor = 1);
+        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
+            int anchor = -1);
 
         //! returns the primitive column filter with the specified kernel
-        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const GpuMat& columnKernel,
-            int anchor = -1, int nDivisor = 1);
+        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
+            int anchor = -1);
 
         //! returns the separable linear filter engine
         CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-            const Mat& columnKernel, const Point& anchor = Point(-1,-1), bool rowFilterFirst = true);
+            const Mat& columnKernel, const Point& anchor = Point(-1,-1));
 
         //! returns filter engine for the generalized Sobel operator
         CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize);
@@ -720,7 +719,7 @@ namespace cv
         //! applies separable 2D linear filter to the image
         CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-            Point anchor = Point(-1,-1), bool rowFilterFirst = true);
+            Point anchor = Point(-1,-1));
 
         //! applies generalized Sobel operator to the image
         CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1);
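Two notes on the hunks above. DevMem2D_ now carries elem_step, the row pitch in elements, alongside the byte pitch step, and the new explicit converting constructor lets host code reinterpret an untyped DevMem2D as a typed DevMem2D_<T>. In gpu.hpp, the filter getters now take plain CPU-side cv::Mat kernels with float taps, so the fixed-point nDivisor parameter and the rowFilterFirst flag are gone. A hedged usage sketch of the reworked sepFilter2D (image size and kernel values are illustrative; assumes opencv2/gpu/gpu.hpp is included, a CUDA device is present, and CV_32FC1 is among the supported source types in this revision):

    // Illustrative usage of the revised separable-filter API.
    cv::Mat kx = cv::getGaussianKernel(7, -1.0, CV_32F);   // float taps; no nDivisor needed
    cv::Mat ky = cv::getGaussianKernel(7, -1.0, CV_32F);

    cv::gpu::GpuMat src, dst;
    src.upload(cv::Mat(480, 640, CV_32FC1, cv::Scalar(0.5f)));
    cv::gpu::sepFilter2D(src, dst, CV_32F, kx, ky);        // row pass, then column pass
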
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 2ed3f43289..b507cc660c 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -316,9 +316,9 @@ void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst)
 ////////////////////////////////////////////////////////////////////////
 // compare
 
-namespace cv { namespace gpu { namespace matrix_operations
+namespace cv { namespace gpu { namespace mathfunc
 {
-    void compare_ne_8u(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
     void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
 }}}
 
@@ -346,7 +346,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
         else
         {
-            matrix_operations::compare_ne_8u(src1, src2, dst);
+            mathfunc::compare_ne_8uc4(src1, src2, dst);
         }
     }
     else
@@ -359,7 +359,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
         else
         {
-            matrix_operations::compare_ne_32f(src1, src2, dst);
+            mathfunc::compare_ne_32f(src1, src2, dst);
         }
     }
 }
diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu
deleted file mode 100644
index 132ca84ceb..0000000000
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ /dev/null
@@ -1,233 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "opencv2/gpu/devmem2d.hpp" -#include "saturate_cast.hpp" -#include "safe_call.hpp" - -using namespace cv::gpu; - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+30F -#endif - -namespace bf_krnls -{ - __constant__ float* ctable_color; - __constant__ float* ctable_space; - __constant__ size_t ctable_space_step; - - __constant__ int cndisp; - __constant__ int cradius; - - __constant__ short cedge_disc; - __constant__ short cmax_disc; -} - -namespace cv { namespace gpu { namespace bf -{ - void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) - { - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.ptr, sizeof(table_space.ptr)) ); - size_t table_space_step = table_space.step / sizeof(float); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); - - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); - - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); - } -}}} - -namespace bf_krnls -{ - template - struct DistRgbMax - { - static __device__ uchar calc(const uchar* a, const uchar* b) - { - uchar x = abs(a[0] - b[0]); - uchar y = abs(a[1] - b[1]); - uchar z = abs(a[2] - b[2]); - return (max(max(x, y), z)); - } - }; - - template <> - struct DistRgbMax<1> - { - static __device__ uchar calc(const uchar* a, const uchar* b) - { - return abs(a[0] - b[0]); - } - }; - - template - __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) - { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); - - T dp[5]; - - if (y > 0 && y < h - 1 && x > 0 && x < w - 1) - { - dp[0] = *(disp + (y ) * disp_step + x + 0); - dp[1] = *(disp + (y-1) * disp_step + x + 0); - dp[2] = *(disp + (y ) * disp_step + x - 1); - dp[3] = *(disp + (y+1) * disp_step + x + 0); - dp[4] = *(disp + (y ) * disp_step + x + 1); - - if(abs(dp[1] - dp[0]) >= cedge_disc || 
abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) - { - const int ymin = max(0, y - cradius); - const int xmin = max(0, x - cradius); - const int ymax = min(h - 1, y + cradius); - const int xmax = min(w - 1, x + cradius); - - float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - - const uchar* ic = img + y * img_step + channels * x; - - for(int yi = ymin; yi <= ymax; yi++) - { - const T* disp_y = disp + yi * disp_step; - - for(int xi = xmin; xi <= xmax; xi++) - { - const uchar* in = img + yi * img_step + channels * xi; - - uchar dist_rgb = DistRgbMax::calc(in, ic); - - const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; - - const T disp_reg = disp_y[xi]; - - cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; - cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; - cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; - cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; - cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; - } - } - - float minimum = FLT_MAX; - int id = 0; - - if (cost[0] < minimum) - { - minimum = cost[0]; - id = 0; - } - if (cost[1] < minimum) - { - minimum = cost[1]; - id = 1; - } - if (cost[2] < minimum) - { - minimum = cost[2]; - id = 2; - } - if (cost[3] < minimum) - { - minimum = cost[3]; - id = 3; - } - if (cost[4] < minimum) - { - minimum = cost[4]; - id = 4; - } - - *(disp + y * disp_step + x) = dp[id]; - } - } - } -} - -namespace cv { namespace gpu { namespace bf -{ - template - void bilateral_filter_caller(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) - { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - grid.x = divUp(disp.cols, threads.x << 1); - grid.y = divUp(disp.rows, threads.y); - - switch (channels) - { - case 1: - for (int i = 0; i < iters; ++i) - { - bf_krnls::bilateral_filter<1><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - bf_krnls::bilateral_filter<1><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - } - break; - case 3: - for (int i = 0; i < iters; ++i) - { - bf_krnls::bilateral_filter<3><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - bf_krnls::bilateral_filter<3><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - } - break; - default: - cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); - } - - if (stream != 0) - cudaSafeCall( cudaThreadSynchronize() ); - } - - void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) - { - bilateral_filter_caller(disp, img, channels, iters, stream); - } - - void bilateral_filter_gpu(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) - { - bilateral_filter_caller(disp, img, channels, iters, stream); - } -}}} diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 699e285a70..e956ff8dd7 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -42,6 +42,7 @@ #include "cuda_shared.hpp" #include "saturate_cast.hpp" +#include "vecmath.hpp" using namespace cv::gpu; @@ -53,16 +54,8 @@ using namespace cv::gpu; #define FLT_EPSILON 1.192092896e-07F #endif -namespace imgproc +namespace imgproc_krnls { - template struct TypeVec {}; - template<> struct TypeVec { typedef uchar3 vec_t; }; - template<> struct TypeVec { typedef uchar4 vec_t; }; - template<> struct 
TypeVec { typedef ushort3 vec_t; }; - template<> struct TypeVec { typedef ushort4 vec_t; }; - template<> struct TypeVec { typedef float3 vec_t; }; - template<> struct TypeVec { typedef float4 vec_t; }; - template struct ColorChannel {}; template<> struct ColorChannel { @@ -106,7 +99,7 @@ namespace imgproc ////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// -namespace imgproc +namespace imgproc_krnls { template __global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) @@ -132,7 +125,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -143,7 +136,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2RGB<<>>(src.ptr, src.step, + imgproc_krnls::RGB2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -189,7 +182,7 @@ namespace cv { namespace gpu { namespace improc /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// -namespace imgproc +namespace imgproc_krnls { template struct RGB5x52RGBConverter {}; template struct RGB5x52RGBConverter<5, DSTCN> @@ -281,7 +274,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -292,7 +285,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB5x52RGB<<>>(src.ptr, src.step, + imgproc_krnls::RGB5x52RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -320,7 +313,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2RGB5x5<<>>(src.ptr, src.step, + imgproc_krnls::RGB2RGB5x5<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -342,7 +335,7 @@ namespace cv { namespace gpu { namespace improc ///////////////////////////////// Grayscale to Color //////////////////////////////// -namespace imgproc +namespace imgproc_krnls { template __global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) @@ -396,7 +389,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) @@ -407,7 +400,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::Gray2RGB<<>>(src.ptr, src.step, + imgproc_krnls::Gray2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -447,7 +440,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::Gray2RGB5x5<<>>(src.ptr, src.step, + imgproc_krnls::Gray2RGB5x5<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -468,7 +461,7 @@ namespace cv { namespace gpu { namespace improc ///////////////////////////////// Color to Grayscale //////////////////////////////// 
-namespace imgproc +namespace imgproc_krnls { #undef R2Y #undef G2Y @@ -550,7 +543,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -561,7 +554,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2Gray<<>>(src.ptr, src.step, + imgproc_krnls::RGB2Gray<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -601,7 +594,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB5x52Gray<<>>(src.ptr, src.step, + imgproc_krnls::RGB5x52Gray<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -622,7 +615,7 @@ namespace cv { namespace gpu { namespace improc ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { __constant__ float cYCrCbCoeffs_f[5]; __constant__ int cYCrCbCoeffs_i[5]; @@ -721,7 +714,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -732,7 +725,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2YCrCb<<>>(src.ptr, src.step, + imgproc_krnls::RGB2YCrCb<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -748,7 +741,7 @@ namespace cv { namespace gpu { namespace improc {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -762,7 +755,7 @@ namespace cv { namespace gpu { namespace improc {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -776,7 +769,7 @@ namespace cv { namespace gpu { namespace improc {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -790,7 +783,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::YCrCb2RGB<<>>(src.ptr, src.step, + imgproc_krnls::YCrCb2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -806,7 +799,7 @@ namespace cv { namespace gpu { namespace improc {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -820,7 +813,7 @@ namespace cv { namespace gpu { namespace improc {YCrCb2RGB_caller, YCrCb2RGB_caller} 
}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -834,7 +827,7 @@ namespace cv { namespace gpu { namespace improc {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -842,7 +835,7 @@ namespace cv { namespace gpu { namespace improc ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { __constant__ float cXYZ_D65f[9]; __constant__ int cXYZ_D65i[9]; @@ -931,7 +924,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) @@ -942,7 +935,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2XYZ<<>>(src.ptr, src.step, + imgproc_krnls::RGB2XYZ<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -958,7 +951,7 @@ namespace cv { namespace gpu { namespace improc {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -972,7 +965,7 @@ namespace cv { namespace gpu { namespace improc {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -986,7 +979,7 @@ namespace cv { namespace gpu { namespace improc {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1000,7 +993,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::XYZ2RGB<<>>(src.ptr, src.step, + imgproc_krnls::XYZ2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) @@ -1016,7 +1009,7 @@ namespace cv { namespace gpu { namespace improc {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1030,7 +1023,7 @@ namespace cv { namespace gpu { namespace improc {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1044,7 +1037,7 @@ namespace cv { namespace gpu { namespace improc {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); + 
cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1052,7 +1045,7 @@ namespace cv { namespace gpu { namespace improc ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { __constant__ int cHsvDivTable[256]; @@ -1229,7 +1222,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) @@ -1241,10 +1234,10 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); if (hrange == 180) - imgproc::RGB2HSV<<>>(src.ptr, src.step, + imgproc_krnls::RGB2HSV<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); else - imgproc::RGB2HSV<<>>(src.ptr, src.step, + imgproc_krnls::RGB2HSV<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -1295,7 +1288,7 @@ namespace cv { namespace gpu { namespace improc 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096 }; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvDivTable, div_table, sizeof(div_table)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvDivTable, div_table, sizeof(div_table)) ); RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1323,10 +1316,10 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); if (hrange == 180) - imgproc::HSV2RGB<<>>(src.ptr, src.step, + imgproc_krnls::HSV2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); else - imgproc::HSV2RGB<<>>(src.ptr, src.step, + imgproc_krnls::HSV2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -1345,7 +1338,7 @@ namespace cv { namespace gpu { namespace improc static const int sector_data[][3] = {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) ); HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1362,7 +1355,7 @@ namespace cv { namespace gpu { namespace improc static const int sector_data[][3] = {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) ); HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1370,7 +1363,7 @@ namespace cv { namespace gpu { namespace improc /////////////////////////////////////// RGB <-> HLS //////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { template struct RGB2HLSConvertor; template struct RGB2HLSConvertor @@ -1541,7 +1534,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) @@ -1553,10 +1546,10 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); if (hrange == 180) - imgproc::RGB2HLS<<>>(src.ptr, src.step, + imgproc_krnls::RGB2HLS<<>>(src.ptr, src.step, dst.ptr, dst.step, 
src.rows, src.cols, bidx); else - imgproc::RGB2HLS<<>>(src.ptr, src.step, + imgproc_krnls::RGB2HLS<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -1598,10 +1591,10 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); if (hrange == 180) - imgproc::HLS2RGB<<>>(src.ptr, src.step, + imgproc_krnls::HLS2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); else - imgproc::HLS2RGB<<>>(src.ptr, src.step, + imgproc_krnls::HLS2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) @@ -1620,7 +1613,7 @@ namespace cv { namespace gpu { namespace improc static const int sector_data[][3]= {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHlsSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) ); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1637,7 +1630,7 @@ namespace cv { namespace gpu { namespace improc static const int sector_data[][3]= {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHlsSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) ); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } diff --git a/modules/gpu/src/cuda/constantspacebp.cu b/modules/gpu/src/cuda/constantspacebp.cu index 7be11791e4..0602834d2b 100644 --- a/modules/gpu/src/cuda/constantspacebp.cu +++ b/modules/gpu/src/cuda/constantspacebp.cu @@ -54,20 +54,18 @@ using namespace cv::gpu; #define SHRT_MAX 32767 #endif -template -struct TypeLimits {}; - -template <> -struct TypeLimits +namespace csbp_krnls { - static __device__ short max() {return SHRT_MAX;} -}; - -template <> -struct TypeLimits -{ - static __device__ float max() {return FLT_MAX;} -}; + template struct TypeLimits; + template <> struct TypeLimits + { + static __device__ short max() {return SHRT_MAX;} + }; + template <> struct TypeLimits + { + static __device__ float max() {return FLT_MAX;} + }; +} /////////////////////////////////////////////////////////////// /////////////////////// load constants //////////////////////// diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp index 449fcb0e1d..a3c5657278 100644 --- a/modules/gpu/src/cuda/cuda_shared.hpp +++ b/modules/gpu/src/cuda/cuda_shared.hpp @@ -58,19 +58,8 @@ namespace cv static inline int divUp(int a, int b) { return (a % b == 0) ? 
a/b : a/b + 1; } - namespace matrix_operations - { - extern "C" void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0); - - extern "C" void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0); - extern "C" void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0); - - extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0); - } - template - inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); } - + static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); } } } diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu new file mode 100644 index 0000000000..185cd633a7 --- /dev/null +++ b/modules/gpu/src/cuda/filters.cu @@ -0,0 +1,455 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#include "opencv2/gpu/devmem2d.hpp"
+#include "saturate_cast.hpp"
+#include "safe_call.hpp"
+#include "cuda_shared.hpp"
+
+using namespace cv::gpu;
+
+#ifndef FLT_MAX
+#define FLT_MAX 3.402823466e+30F
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear filters
+
+#define MAX_KERNEL_SIZE 16
+
+namespace filter_krnls
+{
+    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    void loadLinearKernel(const float kernel[], int ksize)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(filter_krnls::cLinearKernel, kernel, ksize * sizeof(float)) );
+    }
+}}}
+
+namespace filter_krnls
+{
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
+    __global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
+    {
+        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
+
+        const int blockStartX = blockDim.x * blockIdx.x;
+        const int blockStartY = blockDim.y * blockIdx.y;
+
+        const int threadX = blockStartX + threadIdx.x;
+        const int prevThreadX = threadX - blockDim.x;
+        const int nextThreadX = threadX + blockDim.x;
+
+        const int threadY = blockStartY + threadIdx.y;
+
+        T* sDataRow = smem + threadIdx.y * blockDim.x * 3;
+
+        if (threadY < height)
+        {
+            const T* rowSrc = src + threadY * src_step;
+
+            sDataRow[threadIdx.x + blockDim.x] = threadX < width ? rowSrc[threadX] : 0;
+
+            sDataRow[threadIdx.x] = prevThreadX >= 0 ? rowSrc[prevThreadX] : 0;
+
+            sDataRow[(blockDim.x << 1) + threadIdx.x] = nextThreadX < width ? rowSrc[nextThreadX] : 0;
+
+            __syncthreads();
+
+            if (threadX < width)
+            {
+                float sum = 0;
+
+                sDataRow += threadIdx.x + blockDim.x - anchor;
+
+                #pragma unroll
+                for(int i = 0; i < KERNEL_SIZE; ++i)
+                    sum += cLinearKernel[i] * sDataRow[i];
+
+                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
+            }
+        }
+    }
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    template <int KERNEL_SIZE, typename T, typename D>
+    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
+    {
+        const int BLOCK_DIM_X = 16;
+        const int BLOCK_DIM_Y = 16;
+
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.ptr, src.elem_step,
+            dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    template <typename T, typename D>
+    inline void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
+        static const caller_t callers[] =
+        {linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>,
+         linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>,
+         linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>,
+         linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>,
+         linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>,
+         linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>,
+         linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>,
+         linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>};
+
+        loadLinearKernel(kernel, ksize);
+        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+    }
+
+    void linearRowFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<int, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<int, float>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<float, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<float, float>(src, dst, kernel, ksize, anchor);
+    }
+}}}
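The callers[] table above is how a runtime ksize becomes the compile-time KERNEL_SIZE template argument, which in turn lets the #pragma unroll loop in the kernel run with a constant trip count. A reduced, standalone sketch of the same table-of-instantiations pattern (names here are illustrative, not from this patch):

    // Illustrative only: select a compile-time bound from a runtime index.
    template <int N> void body() { /* loop bounded by the constant N, fully unrollable */ }

    typedef void (*body_t)();
    static const body_t table[] = { body<0>, body<1>, body<2>, body<3> };

    void run(int n) { table[n](); }   // valid for 0 <= n < 4, just as callers[ksize] assumes ksize < MAX_KERNEL_SIZE
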
+
+namespace filter_krnls
+{
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
+    __global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
+    {
+        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
+
+        const int blockStartX = blockDim.x * blockIdx.x;
+        const int blockStartY = blockDim.y * blockIdx.y;
+
+        const int threadX = blockStartX + threadIdx.x;
+
+        const int threadY = blockStartY + threadIdx.y;
+        const int prevThreadY = threadY - blockDim.y;
+        const int nextThreadY = threadY + blockDim.y;
+
+        const int smem_step = blockDim.x;
+
+        T* sDataColumn = smem + threadIdx.x;
+
+        if (threadX < width)
+        {
+            const T* colSrc = src + threadX;
+
+            sDataColumn[(threadIdx.y + blockDim.y) * smem_step] = threadY < height ? colSrc[threadY * src_step] : 0;
+
+            sDataColumn[threadIdx.y * smem_step] = prevThreadY >= 0 ? colSrc[prevThreadY * src_step] : 0;
+
+            sDataColumn[(threadIdx.y + (blockDim.y << 1)) * smem_step] = nextThreadY < height ? colSrc[nextThreadY * src_step] : 0;
+
+            __syncthreads();
+
+            if (threadY < height)
+            {
+                float sum = 0;
+
+                sDataColumn += (threadIdx.y + blockDim.y - anchor) * smem_step;
+
+                #pragma unroll
+                for(int i = 0; i < KERNEL_SIZE; ++i)
+                    sum += cLinearKernel[i] * sDataColumn[i * smem_step];
+
+                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
+            }
+        }
+    }
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    template <int KERNEL_SIZE, typename T, typename D>
+    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
+    {
+        const int BLOCK_DIM_X = 16;
+        const int BLOCK_DIM_Y = 16;
+
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.ptr, src.elem_step,
+            dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    template <typename T, typename D>
+    inline void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
+        static const caller_t callers[] =
+        {linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>,
+         linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>,
+         linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>,
+         linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>,
+         linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>,
+         linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>,
+         linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>,
+         linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>};
+
+        loadLinearKernel(kernel, ksize);
+        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+    }
+
+    void linearColumnFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearColumnFilter_gpu<int, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearColumnFilter_gpu_32s32f(const DevMem2D& src, const
DevMem2D& dst, const float kernel[], int ksize, int anchor) + { + linearColumnFilter_gpu(src, dst, kernel, ksize, anchor); + } + void linearColumnFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) + { + linearColumnFilter_gpu(src, dst, kernel, ksize, anchor); + } + void linearColumnFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) + { + linearColumnFilter_gpu(src, dst, kernel, ksize, anchor); + } +}}} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Bilateral filters + +namespace bf_krnls +{ + __constant__ float* ctable_color; + __constant__ float* ctable_space; + __constant__ size_t ctable_space_step; + + __constant__ int cndisp; + __constant__ int cradius; + + __constant__ short cedge_disc; + __constant__ short cmax_disc; +} + +namespace cv { namespace gpu { namespace bf +{ + void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) + { + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.ptr, sizeof(table_space.ptr)) ); + size_t table_space_step = table_space.step / sizeof(float); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); + + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); + + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); + } +}}} + +namespace bf_krnls +{ + template + struct DistRgbMax + { + static __device__ uchar calc(const uchar* a, const uchar* b) + { + uchar x = abs(a[0] - b[0]); + uchar y = abs(a[1] - b[1]); + uchar z = abs(a[2] - b[2]); + return (max(max(x, y), z)); + } + }; + + template <> + struct DistRgbMax<1> + { + static __device__ uchar calc(const uchar* a, const uchar* b) + { + return abs(a[0] - b[0]); + } + }; + + template + __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); + + T dp[5]; + + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * disp_step + x + 1); + + if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) + { + const int ymin = max(0, y - cradius); + const int xmin = max(0, x - cradius); + const int ymax = min(h - 1, y + cradius); + const int xmax = min(w - 1, x + cradius); + + float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + const uchar* ic = img + y * img_step + channels * x; + + for(int yi = ymin; yi <= ymax; yi++) + { + const T* disp_y = disp + yi * disp_step; + + for(int xi = xmin; xi <= xmax; xi++) + { + const uchar* in = img + yi * img_step + channels * xi; + + uchar dist_rgb = DistRgbMax::calc(in, ic); + + const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; + + const T 
disp_reg = disp_y[xi]; + + cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; + cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; + cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; + cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; + cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; + } + } + + float minimum = FLT_MAX; + int id = 0; + + if (cost[0] < minimum) + { + minimum = cost[0]; + id = 0; + } + if (cost[1] < minimum) + { + minimum = cost[1]; + id = 1; + } + if (cost[2] < minimum) + { + minimum = cost[2]; + id = 2; + } + if (cost[3] < minimum) + { + minimum = cost[3]; + id = 3; + } + if (cost[4] < minimum) + { + minimum = cost[4]; + id = 4; + } + + *(disp + y * disp_step + x) = dp[id]; + } + } + } +} + +namespace cv { namespace gpu { namespace bf +{ + template + void bilateral_filter_caller(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + dim3 threads(32, 8, 1); + dim3 grid(1, 1, 1); + grid.x = divUp(disp.cols, threads.x << 1); + grid.y = divUp(disp.rows, threads.y); + + switch (channels) + { + case 1: + for (int i = 0; i < iters; ++i) + { + bf_krnls::bilateral_filter<1><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + bf_krnls::bilateral_filter<1><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + } + break; + case 3: + for (int i = 0; i < iters; ++i) + { + bf_krnls::bilateral_filter<3><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + bf_krnls::bilateral_filter<3><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + } + break; + default: + cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); + } + + if (stream != 0) + cudaSafeCall( cudaThreadSynchronize() ); + } + + void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); + } + + void bilateral_filter_gpu(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); + } +}}} diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index 938e2d1185..e36a9428ac 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -45,7 +45,7 @@ using namespace cv::gpu; /////////////////////////////////// Remap /////////////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { texture tex_remap; @@ -123,7 +123,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst) { @@ -132,15 +132,15 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(dst.cols, threads.x); grid.y = divUp(dst.rows, threads.y); - imgproc::tex_remap.filterMode = cudaFilterModeLinear; - imgproc::tex_remap.addressMode[0] = imgproc::tex_remap.addressMode[1] = cudaAddressModeWrap; + imgproc_krnls::tex_remap.filterMode = cudaFilterModeLinear; + imgproc_krnls::tex_remap.addressMode[0] = imgproc_krnls::tex_remap.addressMode[1] = cudaAddressModeWrap; cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, imgproc::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) ); + cudaSafeCall( cudaBindTexture2D(0, imgproc_krnls::tex_remap, src.ptr, desc, 
src.cols, src.rows, src.step) ); - imgproc::remap_1c<<>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); + imgproc_krnls::remap_1c<<>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture(imgproc::tex_remap) ); + cudaSafeCall( cudaUnbindTexture(imgproc_krnls::tex_remap) ); } void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst) @@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(dst.cols, threads.x); grid.y = divUp(dst.rows, threads.y); - imgproc::remap_3c<<>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); + imgproc_krnls::remap_3c<<>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); cudaSafeCall( cudaThreadSynchronize() ); } @@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace improc /////////////////////////////////// MeanShiftfiltering /////////////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { texture tex_meanshift; @@ -254,7 +254,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps) { @@ -264,11 +264,11 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); + cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); - imgproc::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); + imgproc_krnls::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) ); + cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) ); } extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps) { @@ -278,17 +278,17 @@ namespace cv { namespace gpu { namespace improc grid.y = divUp(src.rows, threads.y); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); + cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); - imgproc::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); + imgproc_krnls::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) ); + cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) ); } }}} /////////////////////////////////// drawColorDisp /////////////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { template __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1) @@ -391,7 +391,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { 
namespace imgproc { void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream) { @@ -400,7 +400,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x << 2); grid.y = divUp(src.rows, threads.y); - imgproc::drawColorDisp<<>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp); + imgproc_krnls::drawColorDisp<<>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -413,7 +413,7 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x << 1); grid.y = divUp(src.rows, threads.y); - imgproc::drawColorDisp<<>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp); + imgproc_krnls::drawColorDisp<<>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -422,7 +422,7 @@ namespace cv { namespace gpu { namespace improc /////////////////////////////////// reprojectImageTo3D /////////////////////////////////////////////// -namespace imgproc +namespace imgproc_krnls { __constant__ float cq[16]; @@ -457,7 +457,7 @@ namespace imgproc } } -namespace cv { namespace gpu { namespace improc +namespace cv { namespace gpu { namespace imgproc { template inline void reprojectImageTo3D_caller(const DevMem2D_& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream) @@ -467,9 +467,9 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(disp.cols, threads.x); grid.y = divUp(disp.rows, threads.y); - cudaSafeCall( cudaMemcpyToSymbol(imgproc::cq, q, 16 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cq, q, 16 * sizeof(float)) ); - imgproc::reprojectImageTo3D<<>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols); + imgproc_krnls::reprojectImageTo3D<<>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index fef25ac1fb..aca1c574e6 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -41,6 +41,9 @@ //M*/ #include "cuda_shared.hpp" +#include "saturate_cast.hpp" +#include "transform.hpp" +#include "vecmath.hpp" using namespace cv::gpu; @@ -48,6 +51,9 @@ using namespace cv::gpu; #define CV_PI 3.1415926535897932384626433832795f #endif +////////////////////////////////////////////////////////////////////////////////////// +// Cart <-> Polar + namespace mathfunc_krnls { struct Nothing @@ -143,8 +149,8 @@ namespace cv { namespace gpu { namespace mathfunc const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f; mathfunc_krnls::cartToPolar<<>>( - x.ptr, x.step / sizeof(float), y.ptr, y.step / sizeof(float), - mag.ptr, mag.step / sizeof(float), angle.ptr, angle.step / sizeof(float), scale, x.cols, x.rows); + x.ptr, x.elem_step, y.ptr, y.elem_step, + mag.ptr, mag.elem_step, angle.ptr, angle.elem_step, scale, x.cols, x.rows); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -191,8 +197,8 @@ namespace cv { namespace gpu { namespace mathfunc const float scale = angleInDegrees ? 
(float)(CV_PI / 180.0f) : 1.0f;
 
-        mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.step / sizeof(float),
-            angle.ptr, angle.step / sizeof(float), scale, x.ptr, x.step / sizeof(float), y.ptr, y.step / sizeof(float), mag.cols, mag.rows);
+        mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.elem_step,
+            angle.ptr, angle.elem_step, scale, x.ptr, x.elem_step, y.ptr, y.elem_step, mag.cols, mag.rows);
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -210,3 +216,37 @@ namespace cv { namespace gpu { namespace mathfunc
         callers[mag.ptr == 0](mag, angle, x, y, angleInDegrees, stream);
     }
 }}}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Compare
+
+namespace mathfunc_krnls
+{
+    template <typename T1, typename T2>
+    struct NotEqual
+    {
+        __device__ uchar operator()(const T1& src1, const T2& src2, int, int)
+        {
+            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
+        }
+    };
+}
+
+namespace cv { namespace gpu { namespace mathfunc
+{
+    template <typename T1, typename T2>
+    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    {
+        mathfunc_krnls::NotEqual<T1, T2> op;
+        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
+    }
+
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    {
+        compare_ne<uint, uint>(src1, src2, dst);
+    }
+    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    {
+        compare_ne<float, float>(src1, src2, dst);
+    }
+}}}
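The compare_ne implementation above delegates the kernel launch to the transform helper from the newly included transform.hpp, which this patch does not show. A minimal sketch of what such an elementwise binary transform could look like (an illustrative assumption, not the actual helper; note how the functor receives the pixel coordinates, matching the trailing (int, int) parameters of NotEqual::operator()):

    // Illustrative only: elementwise binary transform over pitched 2D data.
    // The step arguments are element steps (DevMem2D_::elem_step), not byte steps.
    template <typename T1, typename T2, typename D, typename BinOp>
    __global__ void transformSimple(const T1* src1, size_t step1, const T2* src2, size_t step2,
                                    D* dst, size_t dst_step, int cols, int rows, BinOp op)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < cols && y < rows)
            dst[y * dst_step + x] = op(src1[y * step1 + x], src2[y * step2 + x], x, y);
    }
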
channels) - { - size_t x = blockIdx.x * blockDim.x + threadIdx.x; - size_t y = blockIdx.y * blockDim.y + threadIdx.y; - - if ((x < cols * channels ) && (y < rows)) - { - size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; - mat[idx] = scalar_d[ x % channels ]; - } - } - - template - __global__ void kernel_set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask) - { - size_t x = blockIdx.x * blockDim.x + threadIdx.x; - size_t y = blockIdx.y * blockDim.y + threadIdx.y; - - if ((x < cols * channels ) && (y < rows)) - if (mask[y * step_mask + x / channels] != 0) - { - size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; - mat[idx] = scalar_d[ x % channels ]; - } - } - - - /////////////////////////////////////////////////////////////////////////// - //////////////////////////////// ConvertTo //////////////////////////////// - /////////////////////////////////////////////////////////////////////////// - + template struct shift_and_sizeof; + template <> struct shift_and_sizeof { enum { shift = 0 }; }; + template <> struct shift_and_sizeof { enum { shift = 0 }; }; + template <> struct shift_and_sizeof { enum { shift = 1 }; }; + template <> struct shift_and_sizeof { enum { shift = 1 }; }; + template <> struct shift_and_sizeof { enum { shift = 2 }; }; + template <> struct shift_and_sizeof { enum { shift = 2 }; }; + template <> struct shift_and_sizeof { enum { shift = 3 }; }; + template struct ReadWriteTraits { @@ -218,9 +115,206 @@ namespace mat_operators typedef int2 read_type; typedef short2 write_type; }; +} +/////////////////////////////////////////////////////////////////////////// +////////////////////////////////// CopyTo ///////////////////////////////// +/////////////////////////////////////////////////////////////////////////// + +namespace matop_krnls +{ + template + __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels) + { + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((x < cols * channels ) && (y < rows)) + if (mask[y * step_mask + x / channels] != 0) + { + size_t idx = y * ( step_mat >> shift_and_sizeof::shift ) + x; + mat_dst[idx] = mat_src[idx]; + } + } +} + +namespace cv { namespace gpu { namespace matrix_operations +{ + typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream); + + template + void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream) + { + dim3 threadsPerBlock(16,16, 1); + dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1); + if (stream == 0) + { + ::matop_krnls::copy_to_with_mask<<>> + ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); + cudaSafeCall ( cudaThreadSynchronize() ); + } + else + { + ::matop_krnls::copy_to_with_mask<<>> + ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); + } + } + + void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream) + { + static CopyToFunc tab[8] = + { + copy_to_with_mask_run, + copy_to_with_mask_run, + copy_to_with_mask_run, + copy_to_with_mask_run, 
+ copy_to_with_mask_run, + copy_to_with_mask_run, + copy_to_with_mask_run, + 0 + }; + + CopyToFunc func = tab[depth]; + + if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__); + + func(mat_src, mat_dst, mask, channels, stream); + } +}}} + +/////////////////////////////////////////////////////////////////////////// +////////////////////////////////// SetTo ////////////////////////////////// +/////////////////////////////////////////////////////////////////////////// + +namespace matop_krnls +{ + __constant__ double scalar_d[4]; + + template + __global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels) + { + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((x < cols * channels ) && (y < rows)) + { + size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; + mat[idx] = scalar_d[ x % channels ]; + } + } + + template + __global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask) + { + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((x < cols * channels ) && (y < rows)) + if (mask[y * step_mask + x / channels] != 0) + { + size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; + mat[idx] = scalar_d[ x % channels ]; + } + } +} + +namespace cv { namespace gpu { namespace matrix_operations +{ + typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream); + typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream); + + template + void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream) + { + dim3 threadsPerBlock(32, 8, 1); + dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); + + if (stream == 0) + { + ::matop_krnls::set_to_with_mask<<>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); + cudaSafeCall ( cudaThreadSynchronize() ); + } + else + { + ::matop_krnls::set_to_with_mask<<>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); + } + + } + + template + void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream) + { + dim3 threadsPerBlock(32, 8, 1); + dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); + + if (stream == 0) + { + matop_krnls::set_to_without_mask<<>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); + cudaSafeCall ( cudaThreadSynchronize() ); + } + else + { + matop_krnls::set_to_without_mask<<>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); + } + } + + void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream) + { + cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4)); + + static SetToFunc_without_mask tab[8] = + { + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + set_to_without_mask_run, + 0 + }; + + SetToFunc_without_mask func = tab[depth]; + + if (func == 0) + cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__); + + func(mat, channels, stream); + } + + void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, 
int channels, const cudaStream_t & stream) + { + cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4)); + + static SetToFunc_with_mask tab[8] = + { + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + set_to_with_mask_run, + 0 + }; + + SetToFunc_with_mask func = tab[depth]; + + if (func == 0) + cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__); + + func(mat, mask, channels, stream); + } +}}} + +/////////////////////////////////////////////////////////////////////////// +//////////////////////////////// ConvertTo //////////////////////////////// +/////////////////////////////////////////////////////////////////////////// + +namespace matop_krnls +{ template - __global__ static void kernel_convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta) + __global__ static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta) { typedef typename ReadWriteTraits::read_type read_type; typedef typename ReadWriteTraits::write_type write_type; @@ -253,253 +347,63 @@ namespace mat_operators dst[(x * shift) + i] = saturate_cast
(alpha * src[(x * shift) + i] + beta); } } - } + } +} - /////////////////////////////////////////////////////////////////////////// - /////////////////////////////// compare_ne //////////////////////////////// - /////////////////////////////////////////////////////////////////////////// +namespace cv { namespace gpu { namespace matrix_operations +{ + typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream); - template - __global__ void kernel_compare_ne(uchar* src1, size_t src1_step, uchar* src2, size_t src2_step, uchar* dst, size_t dst_step, int cols, int rows) + template + void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream) { - const size_t x = threadIdx.x + blockIdx.x * blockDim.x; - const size_t y = threadIdx.y + blockIdx.y * blockDim.y; + const int shift = ::matop_krnls::ReadWriteTraits::shift; - if (x < cols && y < rows) + dim3 block(32, 8); + dim3 grid(divUp(width, block.x * shift), divUp(height, block.y)); + + if (stream == 0) { - T src1_pix = ((T*)(src1 + y * src1_step))[x]; - T src2_pix = ((T*)(src2 + y * src2_step))[x]; - uchar res = (uchar)(src1_pix != src2_pix) * 255; - ((dst + y * dst_step))[x] = res; + matop_krnls::convert_to<<>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); + cudaSafeCall( cudaThreadSynchronize() ); + } + else + { + matop_krnls::convert_to<<>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); } } -} // namespace mat_operators -namespace cv -{ - namespace gpu + void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream) { - namespace matrix_operations + static CvtFunc tab[8][8] = { + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, - /////////////////////////////////////////////////////////////////////////// - ////////////////////////////////// CopyTo ///////////////////////////////// - /////////////////////////////////////////////////////////////////////////// + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, - typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream); + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, - template - void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream) - { - dim3 threadsPerBlock(16,16, 1); - dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1); - if (stream == 0) - { - ::mat_operators::kernel_copy_to_with_mask<<>> - ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); - cudaSafeCall ( cudaThreadSynchronize() ); - } - else - { - ::mat_operators::kernel_copy_to_with_mask<<>> - ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); - } - } + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, - extern "C" void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream) - { - static CopyToFunc tab[8] = - { - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - copy_to_with_mask_run, - 0 - }; + {cvt_, cvt_, 
cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, - CopyToFunc func = tab[depth]; + {cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, - if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__); + {cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, - func(mat_src, mat_dst, mask, channels, stream); - } + {0,0,0,0,0,0,0,0} + }; - - /////////////////////////////////////////////////////////////////////////// - ////////////////////////////////// SetTo ////////////////////////////////// - /////////////////////////////////////////////////////////////////////////// - - typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream); - typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream); - - template - void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream) - { - dim3 threadsPerBlock(32, 8, 1); - dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); - - if (stream == 0) - { - ::mat_operators::kernel_set_to_with_mask<<>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); - cudaSafeCall ( cudaThreadSynchronize() ); - } - else - { - ::mat_operators::kernel_set_to_with_mask<<>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); - } - - } - - template - void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream) - { - dim3 threadsPerBlock(32, 8, 1); - dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); - - if (stream == 0) - { - mat_operators::kernel_set_to_without_mask<<>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); - cudaSafeCall ( cudaThreadSynchronize() ); - } - else - { - mat_operators::kernel_set_to_without_mask<<>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); - } - } - - extern "C" void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream) - { - cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar, sizeof(double) * 4)); - - static SetToFunc_without_mask tab[8] = - { - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - set_to_without_mask_run, - 0 - }; - - SetToFunc_without_mask func = tab[depth]; - - if (func == 0) - cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__); - - func(mat, channels, stream); - } - - - extern "C" void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream) - { - cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar, sizeof(double) * 4)); - - static SetToFunc_with_mask tab[8] = - { - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - set_to_with_mask_run, - 0 - }; - - SetToFunc_with_mask func = tab[depth]; - - if (func == 0) - cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__); - - func(mat, mask, channels, stream); - } - - - /////////////////////////////////////////////////////////////////////////// - //////////////////////////////// ConvertTo //////////////////////////////// - /////////////////////////////////////////////////////////////////////////// - - typedef void (*CvtFunc)(const DevMem2D& src, 
DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream); - - template - void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream) - { - const int shift = ::mat_operators::ReadWriteTraits::shift; - - dim3 block(32, 8); - dim3 grid(divUp(width, block.x * shift), divUp(height, block.y)); - - if (stream == 0) - { - mat_operators::kernel_convert_to<<>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); - cudaSafeCall( cudaThreadSynchronize() ); - } - else - { - mat_operators::kernel_convert_to<<>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); - } - } - - extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream) - { - static CvtFunc tab[8][8] = - { - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {0,0,0,0,0,0,0,0} - }; - - CvtFunc func = tab[sdepth][ddepth]; - if (func == 0) - cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); - func(src, dst, src.cols * channels, src.rows, alpha, beta, stream); - } - - /////////////////////////////////////////////////////////////////////////// - /////////////////////////////// compare_ne //////////////////////////////// - /////////////////////////////////////////////////////////////////////////// - - void compare_ne_8u(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) - { - dim3 block(32, 8); - dim3 grid(divUp(src1.cols, block.x), divUp(src1.rows, block.y)); - - mat_operators::kernel_compare_ne<<>>(src1.ptr, src1.step, src2.ptr, src2.step, dst.ptr, dst.step, src1.cols, src1.rows); - cudaSafeCall( cudaThreadSynchronize() ); - } - - void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) - { - dim3 block(32, 8); - dim3 grid(divUp(src1.cols, block.x), divUp(src1.rows, block.y)); - - mat_operators::kernel_compare_ne<<>>(src1.ptr, src1.step, src2.ptr, src2.step, dst.ptr, dst.step, src1.cols, src1.rows); - cudaSafeCall( cudaThreadSynchronize() ); - } - } // namespace matrix_operations - } // namespace gpu -} // namespace cv + CvtFunc func = tab[sdepth][ddepth]; + if (func == 0) + cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); + func(src, dst, src.cols * channels, src.rows, alpha, beta, stream); + } +}}} diff --git a/modules/gpu/src/cuda/saturate_cast.hpp b/modules/gpu/src/cuda/saturate_cast.hpp index 2b58eb1190..e5a5a83600 100644 --- a/modules/gpu/src/cuda/saturate_cast.hpp +++ b/modules/gpu/src/cuda/saturate_cast.hpp @@ -49,124 +49,206 @@ namespace cv { namespace gpu { - // To fix link error: this func already defined in other obj file - namespace + template static __device__ _Tp saturate_cast(uchar v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(schar v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(ushort v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(short v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(uint v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(int v) { return _Tp(v); } + template static __device__ _Tp 
saturate_cast(float v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(double v) { return _Tp(v); } + + template<> static __device__ uchar saturate_cast(schar v) + { return (uchar)max((int)v, 0); } + template<> static __device__ uchar saturate_cast(ushort v) + { return (uchar)min((uint)v, (uint)UCHAR_MAX); } + template<> static __device__ uchar saturate_cast(int v) + { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } + template<> static __device__ uchar saturate_cast(uint v) + { return (uchar)min(v, (uint)UCHAR_MAX); } + template<> static __device__ uchar saturate_cast(short v) + { return saturate_cast((uint)v); } + + template<> static __device__ uchar saturate_cast(float v) + { int iv = __float2int_rn(v); return saturate_cast(iv); } + template<> static __device__ uchar saturate_cast(double v) { - template __device__ _Tp saturate_cast(uchar v) { return _Tp(v); } - template __device__ _Tp saturate_cast(schar v) { return _Tp(v); } - template __device__ _Tp saturate_cast(ushort v) { return _Tp(v); } - template __device__ _Tp saturate_cast(short v) { return _Tp(v); } - template __device__ _Tp saturate_cast(uint v) { return _Tp(v); } - template __device__ _Tp saturate_cast(int v) { return _Tp(v); } - template __device__ _Tp saturate_cast(float v) { return _Tp(v); } - template __device__ _Tp saturate_cast(double v) { return _Tp(v); } - - template<> __device__ uchar saturate_cast(schar v) - { return (uchar)max((int)v, 0); } - template<> __device__ uchar saturate_cast(ushort v) - { return (uchar)min((uint)v, (uint)UCHAR_MAX); } - template<> __device__ uchar saturate_cast(int v) - { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } - template<> __device__ uchar saturate_cast(uint v) - { return (uchar)min(v, (uint)UCHAR_MAX); } - template<> __device__ uchar saturate_cast(short v) - { return saturate_cast((uint)v); } - - template<> __device__ uchar saturate_cast(float v) - { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> __device__ uchar saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); return saturate_cast(iv); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ schar saturate_cast(uchar v) - { return (schar)min((int)v, SCHAR_MAX); } - template<> __device__ schar saturate_cast(ushort v) - { return (schar)min((uint)v, (uint)SCHAR_MAX); } - template<> __device__ schar saturate_cast(int v) - { - return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? - v : v > 0 ? SCHAR_MAX : SCHAR_MIN); - } - template<> __device__ schar saturate_cast(short v) - { return saturate_cast((int)v); } - template<> __device__ schar saturate_cast(uint v) - { return (schar)min(v, (uint)SCHAR_MAX); } - - template<> __device__ schar saturate_cast(float v) - { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> __device__ schar saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); return saturate_cast(iv); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ ushort saturate_cast(schar v) - { return (ushort)max((int)v, 0); } - template<> __device__ ushort saturate_cast(short v) - { return (ushort)max((int)v, 0); } - template<> __device__ ushort saturate_cast(int v) - { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } - template<> __device__ ushort saturate_cast(uint v) - { return (ushort)min(v, (uint)USHRT_MAX); } - template<> __device__ ushort saturate_cast(float v) - { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> __device__ ushort saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); return saturate_cast(iv); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ short saturate_cast(ushort v) - { return (short)min((int)v, SHRT_MAX); } - template<> __device__ short saturate_cast(int v) - { - return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? - v : v > 0 ? SHRT_MAX : SHRT_MIN); - } - template<> __device__ short saturate_cast(uint v) - { return (short)min(v, (uint)SHRT_MAX); } - template<> __device__ short saturate_cast(float v) - { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> __device__ short saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); return saturate_cast(iv); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ int saturate_cast(float v) { return __float2int_rn(v); } - template<> __device__ int saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - return __double2int_rn(v); - #else - return saturate_cast((float)v); - #endif - } - - template<> __device__ uint saturate_cast(float v){ return __float2uint_rn(v); } - template<> __device__ uint saturate_cast(double v) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 - return __double2uint_rn(v); - #else - return saturate_cast((float)v); - #endif - } + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + int iv = __double2int_rn(v); return saturate_cast(iv); + #else + return saturate_cast((float)v); + #endif } + + template<> static __device__ schar saturate_cast(uchar v) + { return (schar)min((int)v, SCHAR_MAX); } + template<> static __device__ schar saturate_cast(ushort v) + { return (schar)min((uint)v, (uint)SCHAR_MAX); } + template<> static __device__ schar saturate_cast(int v) + { + return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? + v : v > 0 ? SCHAR_MAX : SCHAR_MIN); + } + template<> static __device__ schar saturate_cast(short v) + { return saturate_cast((int)v); } + template<> static __device__ schar saturate_cast(uint v) + { return (schar)min(v, (uint)SCHAR_MAX); } + + template<> static __device__ schar saturate_cast(float v) + { int iv = __float2int_rn(v); return saturate_cast(iv); } + template<> static __device__ schar saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + int iv = __double2int_rn(v); return saturate_cast(iv); + #else + return saturate_cast((float)v); + #endif + } + + template<> static __device__ ushort saturate_cast(schar v) + { return (ushort)max((int)v, 0); } + template<> static __device__ ushort saturate_cast(short v) + { return (ushort)max((int)v, 0); } + template<> static __device__ ushort saturate_cast(int v) + { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } + template<> static __device__ ushort saturate_cast(uint v) + { return (ushort)min(v, (uint)USHRT_MAX); } + template<> static __device__ ushort saturate_cast(float v) + { int iv = __float2int_rn(v); return saturate_cast(iv); } + template<> static __device__ ushort saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + int iv = __double2int_rn(v); return saturate_cast(iv); + #else + return saturate_cast((float)v); + #endif + } + + template<> static __device__ short saturate_cast(ushort v) + { return (short)min((int)v, SHRT_MAX); } + template<> static __device__ short saturate_cast(int v) + { + return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? + v : v > 0 ? SHRT_MAX : SHRT_MIN); + } + template<> static __device__ short saturate_cast(uint v) + { return (short)min(v, (uint)SHRT_MAX); } + template<> static __device__ short saturate_cast(float v) + { int iv = __float2int_rn(v); return saturate_cast(iv); } + template<> static __device__ short saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + int iv = __double2int_rn(v); return saturate_cast(iv); + #else + return saturate_cast((float)v); + #endif + } + + template<> static __device__ int saturate_cast(float v) { return __float2int_rn(v); } + template<> static __device__ int saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + return __double2int_rn(v); + #else + return saturate_cast((float)v); + #endif + } + + template<> static __device__ uint saturate_cast(float v){ return __float2uint_rn(v); } + template<> static __device__ uint saturate_cast(double v) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 + return __double2uint_rn(v); + #else + return saturate_cast((float)v); + #endif + } + + template static __device__ _Tp saturate_cast(uchar4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(char4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(ushort4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(short4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(uint4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(int4 v) { return _Tp(v); } + template static __device__ _Tp saturate_cast(float4 v) { return _Tp(v); } + + template<> static __device__ uchar4 saturate_cast(char4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(ushort4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(short4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(uint4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(int4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uchar4 saturate_cast(float4 v) + { return make_uchar4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ char4 saturate_cast(uchar4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(ushort4 v) + { 
return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(short4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(uint4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(int4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ char4 saturate_cast(float4 v) + { return make_char4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ ushort4 saturate_cast(uchar4 v) + { return make_ushort4(v.x, v.y, v.z, v.w); } + template<> static __device__ ushort4 saturate_cast(char4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ ushort4 saturate_cast(short4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ ushort4 saturate_cast(uint4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ ushort4 saturate_cast(int4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ ushort4 saturate_cast(float4 v) + { return make_ushort4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ short4 saturate_cast(uchar4 v) + { return make_short4(v.x, v.y, v.z, v.w); } + template<> static __device__ short4 saturate_cast(char4 v) + { return make_short4(v.x, v.y, v.z, v.w); } + template<> static __device__ short4 saturate_cast(ushort4 v) + { return make_short4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ short4 saturate_cast(uint4 v) + { return make_short4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ short4 saturate_cast(int4 v) + { return make_short4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ short4 saturate_cast(float4 v) + { return make_short4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ uint4 saturate_cast(uchar4 v) + { return make_uint4(v.x, v.y, v.z, v.w); } + template<> static __device__ uint4 saturate_cast(char4 v) + { return make_uint4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uint4 saturate_cast(ushort4 v) + { return make_uint4(v.x, v.y, v.z, v.w); } + template<> static __device__ uint4 saturate_cast(short4 v) + { return make_uint4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uint4 saturate_cast(int4 v) + { return make_uint4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ uint4 saturate_cast(float4 v) + { return make_uint4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + + template<> static __device__ int4 saturate_cast(uchar4 v) + { 
return make_int4(v.x, v.y, v.z, v.w); } + template<> static __device__ int4 saturate_cast(char4 v) + { return make_int4(v.x, v.y, v.z, v.w); } + template<> static __device__ int4 saturate_cast(ushort4 v) + { return make_int4(v.x, v.y, v.z, v.w); } + template<> static __device__ int4 saturate_cast(short4 v) + { return make_int4(v.x, v.y, v.z, v.w); } + template<> static __device__ int4 saturate_cast(uint4 v) + { return make_int4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } + template<> static __device__ int4 saturate_cast(float4 v) + { return make_int4(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } } } diff --git a/modules/gpu/src/cuda/transform.hpp b/modules/gpu/src/cuda/transform.hpp new file mode 100644 index 0000000000..43ed19e952 --- /dev/null +++ b/modules/gpu/src/cuda/transform.hpp @@ -0,0 +1,118 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#ifndef __OPENCV_GPU_TRANSFORM_HPP__
+#define __OPENCV_GPU_TRANSFORM_HPP__
+
+#include "cuda_shared.hpp"
+#include "saturate_cast.hpp"
+#include "vecmath.hpp"
+
+namespace cv { namespace gpu { namespace algo_krnls
+{
+    template <typename T, typename D, typename UnOp>
+    static __global__ void transform(const T* src, size_t src_step,
+        D* dst, size_t dst_step, int width, int height, UnOp op)
+    {
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x < width && y < height)
+        {
+            T src_data = src[y * src_step + x];
+            dst[y * dst_step + x] = op(src_data, x, y);
+        }
+    }
+    template <typename T1, typename T2, typename D, typename BinOp>
+    static __global__ void transform(const T1* src1, size_t src1_step, const T2* src2, size_t src2_step,
+        D* dst, size_t dst_step, int width, int height, BinOp op)
+    {
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x < width && y < height)
+        {
+            T1 src1_data = src1[y * src1_step + x];
+            T2 src2_data = src2[y * src2_step + x];
+            dst[y * dst_step + x] = op(src1_data, src2_data, x, y);
+        }
+    }
+}}}
+
+namespace cv
+{
+    namespace gpu
+    {
+        template <typename T, typename D, typename UnOp>
+        static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
+        {
+            dim3 threads(16, 16, 1);
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(src.cols, threads.x);
+            grid.y = divUp(src.rows, threads.y);
+
+            algo_krnls::transform<<<grid, threads, 0, stream>>>(src.ptr, src.elem_step,
+                dst.ptr, dst.elem_step, src.cols, src.rows, op);
+
+            if (stream == 0)
+                cudaSafeCall( cudaThreadSynchronize() );
+        }
+        template <typename T1, typename T2, typename D, typename BinOp>
+        static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, cudaStream_t stream)
+        {
+            dim3 threads(16, 16, 1);
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(src1.cols, threads.x);
+            grid.y = divUp(src1.rows, threads.y);
+
+            algo_krnls::transform<<<grid, threads, 0, stream>>>(src1.ptr, src1.elem_step,
+                src2.ptr, src2.elem_step, dst.ptr, dst.elem_step, src1.cols, src1.rows, op);
+
+            if (stream == 0)
+                cudaSafeCall( cudaThreadSynchronize() );
+        }
+    }
+}
+
+#endif // __OPENCV_GPU_TRANSFORM_HPP__
diff --git a/modules/gpu/src/cuda/vecmath.hpp b/modules/gpu/src/cuda/vecmath.hpp
new file mode 100644
index 0000000000..225e9584bd
--- /dev/null
+++ b/modules/gpu/src/cuda/vecmath.hpp
@@ -0,0 +1,126 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_VECMATH_HPP__ +#define __OPENCV_GPU_VECMATH_HPP__ + +#include "cuda_shared.hpp" + +namespace cv +{ + namespace gpu + { + template struct TypeVec; + template struct TypeVec { typedef T vec_t; }; + template<> struct TypeVec { typedef uchar2 vec_t; }; + template<> struct TypeVec { typedef uchar2 vec_t; }; + template<> struct TypeVec { typedef uchar3 vec_t; };; + template<> struct TypeVec { typedef uchar3 vec_t; }; + template<> struct TypeVec { typedef uchar4 vec_t; };; + template<> struct TypeVec { typedef uchar4 vec_t; }; + template<> struct TypeVec { typedef char2 vec_t; }; + template<> struct TypeVec { typedef char2 vec_t; }; + template<> struct TypeVec { typedef char3 vec_t; }; + template<> struct TypeVec { typedef char3 vec_t; }; + template<> struct TypeVec { typedef char4 vec_t; }; + template<> struct TypeVec { typedef char4 vec_t; }; + template<> struct TypeVec { typedef ushort2 vec_t; }; + template<> struct TypeVec { typedef ushort2 vec_t; }; + template<> struct TypeVec { typedef ushort3 vec_t; }; + template<> struct TypeVec { typedef ushort3 vec_t; }; + template<> struct TypeVec { typedef ushort4 vec_t; }; + template<> struct TypeVec { typedef ushort4 vec_t; }; + template<> struct TypeVec { typedef short2 vec_t; }; + template<> struct TypeVec { typedef short2 vec_t; }; + template<> struct TypeVec { typedef short3 vec_t; }; + template<> struct TypeVec { typedef short3 vec_t; }; + template<> struct TypeVec { typedef short4 vec_t; }; + template<> struct TypeVec { typedef short4 vec_t; }; + template<> struct TypeVec { typedef uint2 vec_t; }; + template<> struct TypeVec { typedef uint2 vec_t; }; + template<> struct TypeVec { typedef uint3 vec_t; }; + template<> struct TypeVec { typedef uint3 vec_t; }; + template<> struct TypeVec { typedef uint4 vec_t; }; + template<> struct TypeVec { typedef uint4 vec_t; }; + template<> struct TypeVec { typedef int2 vec_t; }; + template<> struct TypeVec { typedef int2 vec_t; }; + template<> struct TypeVec { typedef int3 vec_t; }; + template<> struct TypeVec { typedef int3 vec_t; }; + template<> struct TypeVec { typedef int4 vec_t; }; + template<> struct TypeVec { typedef int4 vec_t; }; + template<> struct TypeVec { typedef float2 vec_t; }; + template<> struct TypeVec { typedef float2 vec_t; }; + template<> struct TypeVec { typedef float3 vec_t; }; + template<> struct TypeVec { typedef float3 vec_t; }; + template<> struct TypeVec { typedef float4 vec_t; }; + template<> struct TypeVec { typedef float4 vec_t; }; + + static __device__ uchar4 operator+(const uchar4& a, const 
uchar4& b) + { + return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + } + static __device__ uchar4 operator-(const uchar4& a, const uchar4& b) + { + return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); + } + static __device__ uchar4 operator*(const uchar4& a, const uchar4& b) + { + return make_uchar4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); + } + static __device__ uchar4 operator/(const uchar4& a, const uchar4& b) + { + return make_uchar4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); + } + template + static __device__ uchar4 operator*(const uchar4& a, T s) + { + return make_uchar4(a.x * s, a.y * s, a.z * s, a.w * s); + } + template + static __device__ uchar4 operator*(T s, const uchar4& a) + { + return a * s; + } + } +} + +#endif // __OPENCV_GPU_VECMATH_HPP__ \ No newline at end of file diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp index f933453e83..7f93c12fc4 100644 --- a/modules/gpu/src/cudastream.cpp +++ b/modules/gpu/src/cudastream.cpp @@ -69,6 +69,22 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int #include "opencv2/gpu/stream_accessor.hpp" +namespace cv +{ + namespace gpu + { + namespace matrix_operations + { + void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0); + + void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0); + void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0); + + void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0); + } + } +} + struct Stream::Impl { cudaStream_t stream; diff --git a/modules/gpu/src/filtering_npp.cpp b/modules/gpu/src/filtering.cpp similarity index 79% rename from modules/gpu/src/filtering_npp.cpp rename to modules/gpu/src/filtering.cpp index a9aceb50ef..a87d194274 100644 --- a/modules/gpu/src/filtering_npp.cpp +++ b/modules/gpu/src/filtering.cpp @@ -49,18 +49,18 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) Ptr cv::gpu::createFilter2D_GPU(const Ptr) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&, bool) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::getMorphologyFilter_GPU(int, int, const GpuMat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::getLinearFilter_GPU(int, int, const GpuMat&, const Size&, Point, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr(0); } -Ptr 
cv::gpu::getLinearRowFilter_GPU(int, int, const GpuMat&, int, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::getLinearColumnFilter_GPU(int, int, const GpuMat&, int, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, bool) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createDerivFilter_GPU(int, int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createGaussianFilter_GPU(int, Size, double, double) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } @@ -71,7 +71,7 @@ void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nog void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); } void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); } -void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, bool) { throw_nogpu(); } +void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point) { throw_nogpu(); } void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); } void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double) { throw_nogpu(); } void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); } @@ -164,28 +164,10 @@ Ptr cv::gpu::createFilter2D_GPU(const Ptr filt namespace { - struct RowColumnFilterApply - { - void operator()(Ptr& rowFilter, Ptr& columnFilter, - GpuMat& srcROI, GpuMat& dstROI, GpuMat& dstBufROI) - { - (*rowFilter)(srcROI, dstBufROI); - (*columnFilter)(dstBufROI, dstROI); - } - }; - struct ColumnRowFilterApply - { - void operator()(Ptr& rowFilter, Ptr& columnFilter, - GpuMat& srcROI, GpuMat& dstROI, GpuMat& dstBufROI) - { - (*columnFilter)(srcROI, dstBufROI); - (*rowFilter)(dstBufROI, dstROI); - } - }; - class SeparableFilterEngine_GPU_base : public FilterEngine_GPU + class SeparableFilterEngine_GPU : public FilterEngine_GPU { public: - SeparableFilterEngine_GPU_base(const Ptr& rowFilter_, + SeparableFilterEngine_GPU(const Ptr& rowFilter_, const Ptr& columnFilter_) : rowFilter(rowFilter_), columnFilter(columnFilter_) { @@ -208,6 +190,9 @@ namespace srcROI = src(roi); dstROI = dst(roi); dstBufROI = dstBuf(roi); + + (*rowFilter)(srcROI, dstBufROI); + (*columnFilter)(dstBufROI, dstROI); } Ptr rowFilter; @@ -219,32 +204,12 @@ namespace GpuMat dstROI; GpuMat dstBufROI; }; - template - class SeparableFilterEngine_GPU : public SeparableFilterEngine_GPU_base - { - public: - SeparableFilterEngine_GPU(const Ptr& rowFilter_, - const Ptr& columnFilter_, FA fa_) : - SeparableFilterEngine_GPU_base(rowFilter_, columnFilter_), fa(fa_) - { - } - - virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) - { - SeparableFilterEngine_GPU_base::apply(src, dst, roi); - fa(rowFilter, columnFilter, srcROI, dstROI, dstBufROI); - } - - FA fa; - }; } Ptr cv::gpu::createSeparableFilter_GPU(const Ptr& rowFilter, - const Ptr& columnFilter, bool rowFilterFirst) + const Ptr& 
columnFilter) { - if (rowFilterFirst) - return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter, RowColumnFilterApply())); - return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter, ColumnRowFilterApply())); + return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -398,7 +363,7 @@ namespace }; } -Ptr cv::gpu::getMorphologyFilter_GPU(int op, int type, const GpuMat& kernel, const Size& ksize, Point anchor) +Ptr cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor) { static const nppMorfFilter_t nppMorfFilter_callers[2][5] = { @@ -408,11 +373,12 @@ Ptr cv::gpu::getMorphologyFilter_GPU(int op, int type, const Gpu CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE); CV_Assert(type == CV_8UC1 || type == CV_8UC4); - CV_Assert(kernel.type() == CV_8UC1 && kernel.rows == 1 && kernel.cols == ksize.area()); - + + GpuMat gpu_krnl; + normalizeKernel(kernel, gpu_krnl); normalizeAnchor(anchor, ksize); - return Ptr(new NPPMorphFilter(ksize, anchor, kernel, nppMorfFilter_callers[op][CV_MAT_CN(type)])); + return Ptr(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)])); } namespace @@ -447,10 +413,7 @@ Ptr cv::gpu::createMorphologyFilter_GPU(int op, int type, cons Size ksize = kernel.size(); - GpuMat gpu_krnl; - normalizeKernel(kernel, gpu_krnl); - - Ptr filter2D = getMorphologyFilter_GPU(op, type, gpu_krnl, ksize, anchor); + Ptr filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor); return Ptr(new MorphologyFilterEngine_GPU(filter2D, iterations)); } @@ -575,27 +538,25 @@ namespace }; } -Ptr cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const GpuMat& kernel, const Size& ksize, Point anchor, int nDivisor) +Ptr cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, Point anchor) { static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R}; - CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); - CV_Assert(kernel.type() == CV_32SC1 && kernel.rows == 1 && kernel.cols == ksize.area()); - + CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); + + GpuMat gpu_krnl; + int nDivisor; + normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true); normalizeAnchor(anchor, ksize); - return Ptr(new NPPLinearFilter(ksize, anchor, kernel, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)])); + return Ptr(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)])); } Ptr cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor) { Size ksize = kernel.size(); - GpuMat gpu_krnl; - int nDivisor; - normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true); - - Ptr linearFilter = getLinearFilter_GPU(srcType, dstType, gpu_krnl, ksize, anchor, nDivisor); + Ptr linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor); return createFilter2D_GPU(linearFilter); } @@ -614,11 +575,26 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke //////////////////////////////////////////////////////////////////////////////////////////////////// // Separable Linear Filter +namespace cv { namespace gpu { namespace filters +{ + void linearRowFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void 
linearRowFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearRowFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearRowFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + + void linearColumnFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearColumnFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearColumnFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearColumnFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); +}}} + namespace { typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor); + typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + class NppLinearRowFilter : public BaseRowFilter_GPU { public: @@ -638,20 +614,64 @@ namespace Npp32s nDivisor; nppFilter1D_t func; }; + + class GpuLinearRowFilter : public BaseRowFilter_GPU + { + public: + GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) : + BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {} + + virtual void operator()(const GpuMat& src, GpuMat& dst) + { + func(src, dst, kernel.ptr(), ksize, anchor); + } + + Mat kernel; + gpuFilter1D_t func; + }; } -Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const GpuMat& rowKernel, int anchor, int nDivisor) +Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor) { + using namespace cv::gpu::filters; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R}; + static const gpuFilter1D_t gpuFilter1D_callers[6][6] = + { + {0,0,0,0,0,0}, + {0,0,0,0,0,0}, + {0,0,0,0,0,0}, + {0,0,0,0,0,0}, + {0,0,0,0,linearRowFilter_gpu_32s32s, linearRowFilter_gpu_32s32f}, + {0,0,0,0,linearRowFilter_gpu_32f32s, linearRowFilter_gpu_32f32f} + }; + + if ((srcType == CV_8UC1 || srcType == CV_8UC4) && bufType == srcType) + { + GpuMat gpu_row_krnl; + int nDivisor; + normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true); - CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && bufType == srcType); - CV_Assert(rowKernel.type() == CV_32SC1 && rowKernel.rows == 1); + int ksize = gpu_row_krnl.cols; + normalizeAnchor(anchor, ksize); - int ksize = rowKernel.cols; + return Ptr(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor, + nppFilter1D_callers[CV_MAT_CN(srcType)])); + } + else if ((srcType == CV_32SC1 || srcType == CV_32FC1) && (bufType == CV_32SC1 || bufType == CV_32FC1)) + { + Mat temp(rowKernel.size(), CV_32FC1); + rowKernel.convertTo(temp, CV_32FC1); + Mat cont_krnl = temp.reshape(1, 1); - normalizeAnchor(anchor, ksize); + int ksize = cont_krnl.cols; + normalizeAnchor(anchor, ksize); - return Ptr(new NppLinearRowFilter(ksize, anchor, rowKernel, nDivisor, nppFilter1D_callers[CV_MAT_CN(srcType)])); + return Ptr(new GpuLinearRowFilter(ksize, anchor, cont_krnl, + gpuFilter1D_callers[CV_MAT_DEPTH(srcType)][CV_MAT_DEPTH(bufType)])); + } + + CV_Assert(!"Unsupported types"); + return Ptr(0); } namespace @@ -675,49 
@@ -675,49 +695,88 @@ namespace
         Npp32s nDivisor;
         nppFilter1D_t func;
     };
+
+    class GpuLinearColumnFilter : public BaseColumnFilter_GPU
+    {
+    public:
+        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) :
+            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
+
+        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        {
+            func(src, dst, kernel.ptr<float>(), ksize, anchor);
+        }
+
+        Mat kernel;
+        gpuFilter1D_t func;
+    };
 }
 
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const GpuMat& columnKernel, int anchor, int nDivisor)
+Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor)
 {
+    using namespace cv::gpu::filters;
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};
+    static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
+    {
+        {0,0,0,0,0,0},
+        {0,0,0,0,0,0},
+        {0,0,0,0,0,0},
+        {0,0,0,0,0,0},
+        {0,0,0,0,linearColumnFilter_gpu_32s32s, linearColumnFilter_gpu_32s32f},
+        {0,0,0,0,linearColumnFilter_gpu_32f32s, linearColumnFilter_gpu_32f32f}
+    };
+
+    if ((bufType == CV_8UC1 || bufType == CV_8UC4) && dstType == bufType)
+    {
+        GpuMat gpu_col_krnl;
+        int nDivisor;
+        normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
 
-    CV_Assert((bufType == CV_8UC1 || bufType == CV_8UC4) && dstType == bufType);
-    CV_Assert(columnKernel.type() == CV_32SC1 && columnKernel.rows == 1);
+        int ksize = gpu_col_krnl.cols;
+        normalizeAnchor(anchor, ksize);
 
-    int ksize = columnKernel.cols;
+        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor,
+            nppFilter1D_callers[CV_MAT_CN(bufType)]));
+    }
+    else if ((bufType == CV_32SC1 || bufType == CV_32FC1) && (dstType == CV_32SC1 || dstType == CV_32FC1))
+    {
+        Mat temp(columnKernel.size(), CV_32FC1);
+        columnKernel.convertTo(temp, CV_32FC1);
+        Mat cont_krnl = temp.reshape(1, 1);
 
-    normalizeAnchor(anchor, ksize);
+        int ksize = cont_krnl.cols;
+        normalizeAnchor(anchor, ksize);
 
-    return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, columnKernel, nDivisor, nppFilter1D_callers[CV_MAT_CN(bufType)]));
+        return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl,
+            gpuFilter1D_callers[CV_MAT_DEPTH(bufType)][CV_MAT_DEPTH(dstType)]));
+    }
+
+    CV_Assert(!"Unsupported types");
+    return Ptr<BaseColumnFilter_GPU>(0);
 }
 
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
-    const Point& anchor, bool rowFilterFirst)
+    const Point& anchor)
 {
     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
     int cn = CV_MAT_CN(srcType);
     int bdepth = std::max(sdepth, ddepth);
     int bufType = CV_MAKETYPE(bdepth, cn);
 
-    GpuMat gpu_row_krnl, gpu_col_krnl;
-    int nRowDivisor, nColDivisor;
-    normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nRowDivisor, true);
-    normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nColDivisor, true);
+    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x);
+    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y);
 
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, gpu_row_krnl, anchor.x, nRowDivisor);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, gpu_col_krnl, anchor.y, nColDivisor);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter, rowFilterFirst);
+    return createSeparableFilter_GPU(rowFilter, columnFilter);
 }
 
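createSeparableLinearFilter_GPU picks the intermediate buffer depth as the deeper of source and destination, so precision is not lost between the two passes. A minimal sketch of that derivation, using only the CV_* macros shown above (the helper name is hypothetical):

    #include <algorithm>
    #include <opencv2/core/core.hpp>

    int bufTypeFor(int srcType, int dstType)
    {
        // e.g. src CV_32SC1 (depth 4), dst CV_32FC1 (depth 5):
        // bdepth = max(4, 5) = 5, so rows are filtered into a CV_32FC1
        // buffer before the column pass reads it back.
        int bdepth = std::max(CV_MAT_DEPTH(srcType), CV_MAT_DEPTH(dstType));
        return CV_MAKETYPE(bdepth, CV_MAT_CN(srcType));
    }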
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, bool rowFilterFirst)
+void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor)
 {
     if( ddepth < 0 )
         ddepth = src.depth();
 
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
 
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowFilterFirst);
+    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor);
     f->apply(src, dst);
 }
 
@@ -728,7 +787,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, i
 {
     Mat kx, ky;
     getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), dx >= dy);
+    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1));
 }
 
 void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale)
@@ -746,7 +805,7 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
         ky *= scale;
     }
 
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), dx >= dy);
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
 }
 
 void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale)
@@ -764,7 +823,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
         ky *= scale;
     }
 
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), dx >= dy);
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
 }
 
 void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale)
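With the rowFilterFirst flag gone, Sobel and Scharr no longer reorder the passes on dx >= dy; the engine always applies the row kernel first, then the column kernel. Callers simply drop the last argument; a minimal sketch, with a hypothetical image:

    #include <opencv2/gpu/gpu.hpp>

    void sobelExample()
    {
        cv::gpu::GpuMat src(cv::Mat::zeros(480, 640, CV_8UC1)), dx;
        // First-order x-derivative with a 3x3 aperture; the kernels come
        // from getDerivKernels inside createDerivFilter_GPU.
        cv::gpu::Sobel(src, dx, CV_8U, 1, 0, 3);
    }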
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 73b44498e0..c60e605991 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -75,7 +75,7 @@ void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu();
 
 namespace cv { namespace gpu
 {
-    namespace improc
+    namespace imgproc
     {
         void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
         void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
@@ -142,7 +142,7 @@ namespace cv { namespace gpu
 void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap)
 {
     typedef void (*remap_gpu_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
-    static const remap_gpu_t callers[] = {improc::remap_gpu_1c, 0, improc::remap_gpu_3c};
+    static const remap_gpu_t callers[] = {imgproc::remap_gpu_1c, 0, imgproc::remap_gpu_3c};
 
     CV_Assert((src.type() == CV_8U || src.type() == CV_8UC3) && xmap.type() == CV_32F && ymap.type() == CV_32F);
 
@@ -180,7 +180,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
         eps = 1.f;
     eps = (float)std::max(criteria.epsilon, 0.0);
 
-    improc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
+    imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -207,7 +207,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
         eps = 1.f;
     eps = (float)std::max(criteria.epsilon, 0.0);
 
-    improc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
+    imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -223,7 +223,7 @@ namespace
             out = dst;
         out.create(src.size(), CV_8UC4);
 
-        improc::drawColorDisp_gpu((DevMem2D_<T>)src, out, ndisp, stream);
+        imgproc::drawColorDisp_gpu((DevMem2D_<T>)src, out, ndisp, stream);
 
         dst = out;
     }
 
@@ -256,7 +256,7 @@ namespace
     void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)
     {
         xyzw.create(disp.rows, disp.cols, CV_32FC4);
-        improc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
+        imgproc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
     }
 
     typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream);
@@ -313,7 +313,7 @@ namespace
             case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
-                static const func_t funcs[] = {improc::RGB2RGB_gpu_8u, 0, improc::RGB2RGB_gpu_16u, 0, 0, improc::RGB2RGB_gpu_32f};
+                static const func_t funcs[] = {imgproc::RGB2RGB_gpu_8u, 0, imgproc::RGB2RGB_gpu_16u, 0, 0, imgproc::RGB2RGB_gpu_32f};
 
                 CV_Assert(scn == 3 || scn == 4);
 
@@ -338,7 +338,7 @@ namespace
 
                 dst.create(sz, CV_8UC2);
 
-                improc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);
+                imgproc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);
                 break;
             }
 
@@ -356,14 +356,14 @@ namespace
 
                 dst.create(sz, CV_MAKETYPE(depth, dcn));
 
-                improc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);
+                imgproc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);
                 break;
             }
 
             case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);
-                static const func_t funcs[] = {improc::RGB2Gray_gpu_8u, 0, improc::RGB2Gray_gpu_16u, 0, 0, improc::RGB2Gray_gpu_32f};
+                static const func_t funcs[] = {imgproc::RGB2Gray_gpu_8u, 0, imgproc::RGB2Gray_gpu_16u, 0, 0, imgproc::RGB2Gray_gpu_32f};
 
                 CV_Assert(scn == 3 || scn == 4);
 
@@ -383,14 +383,14 @@ namespace
 
                 dst.create(sz, CV_8UC1);
 
-                improc::RGB5x52Gray_gpu(src, green_bits, dst, stream);
+                imgproc::RGB5x52Gray_gpu(src, green_bits, dst, stream);
                 break;
             }
 
             case CV_GRAY2BGR: case CV_GRAY2BGRA:
             {
                 typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);
-                static const func_t funcs[] = {improc::Gray2RGB_gpu_8u, 0, improc::Gray2RGB_gpu_16u, 0, 0, improc::Gray2RGB_gpu_32f};
+                static const func_t funcs[] = {imgproc::Gray2RGB_gpu_8u, 0, imgproc::Gray2RGB_gpu_16u, 0, 0, imgproc::Gray2RGB_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -410,7 +410,7 @@ namespace
 
                 dst.create(sz, CV_8UC2);
 
-                improc::Gray2RGB5x5_gpu(src, dst, green_bits, stream);
+                imgproc::Gray2RGB5x5_gpu(src, dst, green_bits, stream);
                 break;
             }
 
@@ -419,7 +419,7 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
-                static const func_t funcs[] = {improc::RGB2YCrCb_gpu_8u, 0, improc::RGB2YCrCb_gpu_16u, 0, 0, improc::RGB2YCrCb_gpu_32f};
+                static const func_t funcs[] = {imgproc::RGB2YCrCb_gpu_8u, 0, imgproc::RGB2YCrCb_gpu_16u, 0, 0, imgproc::RGB2YCrCb_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
                 CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
 
@@ -456,7 +456,7 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
-                static const func_t funcs[] = {improc::YCrCb2RGB_gpu_8u, 0, improc::YCrCb2RGB_gpu_16u, 0, 0, improc::YCrCb2RGB_gpu_32f};
+                static const func_t funcs[] = {imgproc::YCrCb2RGB_gpu_8u, 0, imgproc::YCrCb2RGB_gpu_16u, 0, 0, imgproc::YCrCb2RGB_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -485,7 +485,7 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-                static const func_t funcs[] = {improc::RGB2XYZ_gpu_8u, 0, improc::RGB2XYZ_gpu_16u, 0, 0, improc::RGB2XYZ_gpu_32f};
+                static const func_t funcs[] = {imgproc::RGB2XYZ_gpu_8u, 0, imgproc::RGB2XYZ_gpu_16u, 0, 0, imgproc::RGB2XYZ_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -534,7 +534,7 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-                static const func_t funcs[] = {improc::XYZ2RGB_gpu_8u, 0, improc::XYZ2RGB_gpu_16u, 0, 0, improc::XYZ2RGB_gpu_32f};
+                static const func_t funcs[] = {imgproc::XYZ2RGB_gpu_8u, 0, imgproc::XYZ2RGB_gpu_16u, 0, 0, imgproc::XYZ2RGB_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -584,8 +584,8 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-                static const func_t funcs_hsv[] = {improc::RGB2HSV_gpu_8u, 0, 0, 0, 0, improc::RGB2HSV_gpu_32f};
-                static const func_t funcs_hls[] = {improc::RGB2HLS_gpu_8u, 0, 0, 0, 0, improc::RGB2HLS_gpu_32f};
+                static const func_t funcs_hsv[] = {imgproc::RGB2HSV_gpu_8u, 0, 0, 0, 0, imgproc::RGB2HSV_gpu_32f};
+                static const func_t funcs_hls[] = {imgproc::RGB2HLS_gpu_8u, 0, 0, 0, 0, imgproc::RGB2HLS_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
@@ -610,8 +610,8 @@ namespace
             {
                 typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-                static const func_t funcs_hsv[] = {improc::HSV2RGB_gpu_8u, 0, 0, 0, 0, improc::HSV2RGB_gpu_32f};
-                static const func_t funcs_hls[] = {improc::HLS2RGB_gpu_8u, 0, 0, 0, 0, improc::HLS2RGB_gpu_32f};
+                static const func_t funcs_hsv[] = {imgproc::HSV2RGB_gpu_8u, 0, 0, 0, 0, imgproc::HSV2RGB_gpu_32f};
+                static const func_t funcs_hls[] = {imgproc::HLS2RGB_gpu_8u, 0, 0, 0, 0, imgproc::HLS2RGB_gpu_32f};
 
                 if (dcn <= 0) dcn = 3;
 
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 7d58619b28..814c79c6b4 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -77,6 +77,22 @@ namespace cv
 
 #else /* !defined (HAVE_CUDA) */
 
+namespace cv
+{
+    namespace gpu
+    {
+        namespace matrix_operations
+        {
+            void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
+
+            void set_to_without_mask(DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
+            void set_to_with_mask(DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
+
+            void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
+        }
+    }
+}
+
 void cv::gpu::GpuMat::upload(const Mat& m)
 {
     CV_DbgAssert(!m.empty());
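The block above forward-declares the CUDA-side entry points that GpuMat's methods call. For illustration only, a hedged sketch of how a host wrapper might forward to the declared convert_to; the wrapper itself is hypothetical and assumes GpuMat's implicit conversion to DevMem2D:

    #include <opencv2/gpu/gpu.hpp>

    // Hypothetical helper, not part of this patch: convert any GpuMat to
    // CV_32F per channel, forwarding to the declared launcher with the
    // default (synchronous) stream.
    void convertToFloat(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst,
                        double alpha = 1.0, double beta = 0.0)
    {
        dst.create(src.size(), CV_MAKETYPE(CV_32F, src.channels()));
        cv::gpu::matrix_operations::convert_to(src, src.depth(), dst, CV_32F,
                                               src.channels(), alpha, beta);
    }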
diff --git a/tests/gpu/src/gputest_main.cpp b/tests/gpu/src/gputest_main.cpp
index ebcbf2c65e..12d62a8c15 100644
--- a/tests/gpu/src/gputest_main.cpp
+++ b/tests/gpu/src/gputest_main.cpp
@@ -53,7 +53,6 @@ const char* blacklist[] =
     //"GPU-NppImageMeanStdDev", // different precision
     //"GPU-NppImageExp", // different precision
     //"GPU-NppImageLog", // different precision
-    //"GPU-NppImageMagnitude", // different precision
 
     "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
     //"GPU-NppImageResize", // different precision
@@ -61,8 +60,8 @@ const char* blacklist[] =
     //"GPU-NppImageWarpPerspective", // different precision
     //"GPU-NppImageIntegral", // different precision
 
-    //"GPU-NppImageSobel", // ???
-    //"GPU-NppImageScharr", // ???
+    //"GPU-NppImageSobel", // sign error
+    //"GPU-NppImageScharr", // sign error
 
     //"GPU-NppImageGaussianBlur", // different precision
     0
 };