diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index a2506f1ee2..83fbd60c5f 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -917,8 +917,11 @@ CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTra
                          GpuMat& labels,
                          GpuMat& buf, Stream& stream = Stream::Null());
 
+//! compute mask for Generalized Flood fill componetns labeling.
+CV_EXPORTS void connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& stream = Stream::Null());
+
 //! performs connected componnents labeling.
-CV_EXPORTS void labelComponents(const GpuMat& image, GpuMat& mask, GpuMat& components, const cv::Scalar& lo, const cv::Scalar& hi, Stream& stream = Stream::Null());
+CV_EXPORTS void labelComponents(const GpuMat& mask, GpuMat& components, Stream& stream = Stream::Null());
 
 ////////////////////////////////// Histograms //////////////////////////////////
 
diff --git a/modules/gpu/src/cuda/ccomponetns.cu b/modules/gpu/src/cuda/ccomponetns.cu
index fbc68cbcf3..9b8d4c9925 100644
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@@ -65,32 +65,108 @@ namespace cv { namespace gpu { namespace device
             TILE_ROWS = CTA_SIZE_Y * TPB_Y
         };
 
+        template<typename T> struct IntervalsTraits
+        {
+            typedef T elem_type;
+        };
+
+        template<> struct IntervalsTraits<unsigned char>
+        {
+            typedef int dist_type;
+            enum {ch = 1};
+        };
+
+        template<> struct IntervalsTraits<uchar3>
+        {
+            typedef int3 dist_type;
+            enum {ch = 3};
+        };
+
+        template<> struct IntervalsTraits<uchar4>
+        {
+            typedef int3 dist_type;
+            enum {ch = 4};
+        };
+
+        template<> struct IntervalsTraits<unsigned short>
+        {
+            typedef int dist_type;
+            enum {ch = 1};
+        };
+
+        template<> struct IntervalsTraits<ushort3>
+        {
+            typedef int3 dist_type;
+            enum {ch = 3};
+        };
+
+        template<> struct IntervalsTraits<ushort4>
+        {
+            typedef int4 dist_type;
+            enum {ch = 4};
+        };
+
+        template<> struct IntervalsTraits<float>
+        {
+            typedef float dist_type;
+            enum {ch = 1};
+        };
+
+        template<> struct IntervalsTraits<int>
+        {
+            typedef int dist_type;
+            enum {ch = 1};
+        };
+
         typedef unsigned char component;
         enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
 
-        template<typename T>
-        struct InInterval
+        template<typename T, int CH> struct InInterval {};
+
+        template<typename T> struct InInterval<T, 1>
         {
-            __host__ __device__ __forceinline__ InInterval(const T& _lo, const T& _hi) : lo(-_lo), hi(_hi) {};
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo(-_lo.x), hi(_hi.x) {};
             T lo, hi;
 
-            __device__ __forceinline__ bool operator() (const T& a, const T& b) const
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
             {
                 T d = a - b;
                 return lo <= d && d <= hi;
             }
-
         };
 
-        template<typename F>
-        __global__ void computeConnectivity(const DevMem2D image, DevMem2D components, F connected)
+        template<typename T> struct InInterval<T, 3>
+        {
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi){};
+            T lo, hi;
+
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
+            {
+                return true;
+            }
+        };
+
+        template<typename T> struct InInterval<T, 4>
+        {
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi){};
+            T lo, hi;
+
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
+            {
+                return true;
+            }
+        };
+
+
+        template<typename T, typename F>
+        __global__ void computeConnectivity(const DevMem2D_<T> image, DevMem2D components, F connected)
         {
             int x = threadIdx.x + blockIdx.x * blockDim.x;
             int y = threadIdx.y + blockIdx.y * blockDim.y;
 
             if (x >= image.cols || y >= image.rows) return;
 
-            int intensity = image(y, x);
+            T intensity = image(y, x);
             component c = 0;
 
             if ( x > 0 && connected(intensity, image(y, x - 1)))
@@ -108,18 +184,31 @@ namespace cv { namespace gpu { namespace device
             components(y, x) = c;
         }
 
-        void computeEdges(const DevMem2D& image, DevMem2D edges, const int lo, const int hi, cudaStream_t stream)
+        template< typename T>
+        void computeEdges(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream)
         {
             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
             dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
-            InInterval<int> inInt(lo, hi);
-            computeConnectivity<InInterval<int> ><<<grid, block, 0, stream>>>(image, edges, inInt);
+
+            typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
+
+            Int_t inInt(lo, hi);
+            computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const DevMem2D_<T> >(image), edges, inInt);
 
             cudaSafeCall( cudaGetLastError() );
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
 
+        template void computeEdges<uchar>  (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar3> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar4> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort3>(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort4>(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<int>    (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<float>  (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+
         __global__ void lableTiles(const DevMem2D edges, DevMem2Di comps)
         {
             int x = threadIdx.x + blockIdx.x * TILE_COLS;
diff --git a/modules/gpu/src/graphcuts.cpp b/modules/gpu/src/graphcuts.cpp
index 3ea9d3bc2b..dd8cf8e59b 100644
--- a/modules/gpu/src/graphcuts.cpp
+++ b/modules/gpu/src/graphcuts.cpp
@@ -47,7 +47,8 @@
 void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 
-void cv::gpu::labelComponents(const GpuMat&, GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_nogpu(); }
+void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_nogpu(); }
+void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, Stream& stream) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -56,29 +57,65 @@ namespace cv { namespace gpu { namespace device
     namespace ccl
     {
         void labelComponents(const DevMem2D& edges, DevMem2Di comps, cudaStream_t stream);
-        void computeEdges(const DevMem2D& image, DevMem2D edges, const int lo, const int hi, cudaStream_t stream);
+
+        template<typename T>
+        void computeEdges(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
     }
 }}}
 
-void cv::gpu::labelComponents(const GpuMat& image, GpuMat& mask, GpuMat& components, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
+
+float4 scalarToCudaType(const cv::Scalar& in)
+{
+    float4 res;
+    res.x = in[0]; res.y = in[1]; res.z = in[2]; res.w = in[3];
+    return res;
+}
+
+
+void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
 {
     CV_Assert(!image.empty());
 
-    int type = image.type();
-    CV_Assert(type == CV_8UC1);
+    int ch = image.channels();
+    CV_Assert(ch <= 4);
+
+    int depth = image.depth();
+
+    typedef void (*func_t)(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+
+    static const func_t suppotLookup[8][4] =
+    {   //    1,    2,     3,     4
+        { device::ccl::computeEdges<uchar>,  0,  device::ccl::computeEdges<uchar3>,  device::ccl::computeEdges<uchar4>  },// CV_8U
+        { 0,                                 0,  0,                                  0                                  },// CV_16U
+        { device::ccl::computeEdges<ushort>, 0,  device::ccl::computeEdges<ushort3>, device::ccl::computeEdges<ushort4> },// CV_8S
+        { 0,                                 0,  0,                                  0                                  },// CV_16S
+        { device::ccl::computeEdges<int>,    0,  0,                                  0                                  },// CV_32S
+        { device::ccl::computeEdges<float>,  0,  0,                                  0                                  },// CV_32F
+        { 0,                                 0,  0,                                  0                                  },// CV_64F
+        { 0,                                 0,  0,                                  0                                  } // CV_USRTYPE1
+    };
+
+    func_t f = suppotLookup[depth][ch - 1];
+    CV_Assert(f);
 
     if (image.size() != mask.size() || mask.type() != CV_8UC1)
         mask.create(image.size(), CV_8UC1);
 
-    if (image.size() != components.size() || components.type() != CV_32SC1)
-        components.create(image.size(), CV_32SC1);
-
     cudaStream_t stream = StreamAccessor::getStream(s);
-
-    device::ccl::computeEdges(image, mask, lo[0], hi[0], stream);
-    device::ccl::labelComponents(mask, components, stream);
+    float4 culo = scalarToCudaType(lo), cuhi = scalarToCudaType(hi);
+    f(image, mask, culo, cuhi, stream);
 }
 
+void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, Stream& s)
+{
+    CV_Assert(!mask.empty() && mask.type() == CV_8U);
+
+    if (mask.size() != components.size() || components.type() != CV_32SC1)
+        components.create(mask.size(), CV_32SC1);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    device::ccl::labelComponents(mask, components, stream);
+}
 
 namespace
 {
diff --git a/modules/gpu/test/test_labeling.cpp b/modules/gpu/test/test_labeling.cpp
index 2ff010de2a..6ba0ef5483 100644
--- a/modules/gpu/test/test_labeling.cpp
+++ b/modules/gpu/test/test_labeling.cpp
@@ -70,7 +70,9 @@ TEST_P(Labeling, ConnectedComponents)
     cv::gpu::GpuMat components;
     components.create(image.rows, image.cols, CV_32SC1);
 
-    cv::gpu::labelComponents(cv::gpu::GpuMat(image), mask, components, cv::Scalar::all(0), cv::Scalar::all(2));
+    cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
+
+    cv::gpu::labelComponents(mask, components);
 
     // std::cout << cv::Mat(components) << std::endl;
     // cv::imshow("test", image);