diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 53a5404d87..0954c7a5c2 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1464,6 +1464,7 @@ private: GpuMat resuzeBuffer; GpuMat candidates; + static const int integralFactor = 4; }; ////////////////////////////////// SURF ////////////////////////////////////////// diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 644ce6e5c9..c1ccf61b60 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -67,7 +67,7 @@ cv::gpu::CascadeClassifier_GPU_LBP::~CascadeClassifier_GPU_LBP() bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const { throw_nogpu(); return true; } bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string&) { throw_nogpu(); return true; } Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const { throw_nogpu(); return Size(); } -void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();} +void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();} int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/, double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;} @@ -86,7 +86,7 @@ void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size frame) { resuzeBuffer.create(frame, CV_8UC1); - integral.create(frame.height + 1, frame.width + 1, CV_32SC1); + integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1); NcvSize32u roiSize; roiSize.width = frame.width; roiSize.height = frame.height; @@ -284,14 +284,83 @@ namespace cv { namespace gpu { namespace device DevMem2D_ objects, unsigned int* classified); + void classifyPyramid(int frameW, + int frameH, + int windowW, + int windowH, + float initalScale, + float factor, + int total, + const DevMem2Db& mstages, + const int nstages, + const DevMem2Di& mnodes, + const DevMem2Df& mleaves, + const DevMem2Di& msubsets, + const DevMem2Db& mfeatures, + const int subsetSize, + DevMem2D_ objects, + unsigned int* classified, + DevMem2Di integral); + void connectedConmonents(DevMem2D_ candidates, int ncandidates, DevMem2D_ objects,int groupThreshold, float grouping_eps, unsigned int* nclasses); void bindIntegral(DevMem2Di integral); void unbindIntegral(); } }}} -int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, - double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/) +cv::Size operator -(const cv::Size& a, const cv::Size& b) +{ + return cv::Size(a.width - b.width, a.height - b.height); +} + +cv::Size operator +(const cv::Size& a, const int& i) +{ + return cv::Size(a.width + i, a.height + i); +} + +cv::Size operator *(const cv::Size& a, const float& f) +{ + return cv::Size(cvRound(a.width * f), cvRound(a.height * f)); +} + +cv::Size operator /(const cv::Size& a, const float& f) +{ + return cv::Size(cvRound(a.width / f), cvRound(a.height / f)); +} + +bool operator <=(const cv::Size& a, const cv::Size& b) +{ + return a.width <= b.width && a.height <= b.width; +} + +struct PyrLavel +{ + PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order) + { + scale = pow(_scale, order); + sFrame = frame / scale; + workArea = sFrame - window + 1; + sWindow = window * scale; + } + + bool isFeasible(cv::Size 
maxObj) + { + return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj; + } + + PyrLavel next(float factor, cv::Size frame, cv::Size window) + { + return PyrLavel(order + 1, factor, frame, window); + } + + int order; + float scale; + cv::Size sFrame; + cv::Size workArea; + cv::Size sWindow; +}; + +int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize) { CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U); @@ -306,6 +375,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp // used for debug // candidates.setTo(cv::Scalar::all(0)); // objects.setTo(cv::Scalar::all(0)); + if (maxObjectSize == cv::Size()) maxObjectSize = image.size(); @@ -315,52 +385,54 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp GpuMat dclassified(1, 1, CV_32S); cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) ); - // cv::gpu::device::lbp::bindIntegral(integral); + PyrLavel level(0, 1.0f, image.size(), NxM); - Size scaledImageSize(image.cols, image.rows); - Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); - Size windowSize(NxM.width, NxM.height); - - float factor = 1; - - for (;;) + while (level.isFeasible(maxObjectSize)) { - if (processingRectSize.width <= 0 || processingRectSize.height <= 0 ) - break; + int acc = level.sFrame.width + 1; + float iniScale = level.scale; + cv::Size area = level.workArea; + float step = (float)(1 + (level.scale <= 2.f)); - if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height ) - break; + int total = 0, prev = 0; - // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height ) - // continue; + while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize)) + { + // create sutable matrix headers + GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height)); + GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1)); + GpuMat buff = integralBuffer; - GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height)); - GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1)); - GpuMat currBuff = integralBuffer; + // generate integral for scale + gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR); + gpu::integralBuffered(src, sint, buff); - gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR); - gpu::integralBuffered(scaledImg, scaledIntegral, currBuff); + total += cvCeil(area.width / step) * cvCeil(area.height / step); + // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl; - int step = factor <= 2.f ? 
2 : 1; + // increment pyr lavel + level = level.next(scaleFactor, image.size(), NxM); + area = level.workArea; - device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat, - processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr()); + step = (float)(1 + (level.scale <= 2.f)); + prev = acc; + acc += level.sFrame.width + 1; + } - factor *= scaleFactor; - windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor)); - scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor )); - processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 ); + device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, + leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr(), integral); } - // cv::gpu::device::lbp::unbindIntegral(); if (groupThreshold <= 0 || objects.empty()) return 0; cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) ); device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr()); + // candidates.copyTo(objects); cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaDeviceSynchronize() ); + // std::cout << classified << " !!!!!!!!!!" << std::endl; return classified; } diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index 42ddd036d2..2667167fbe 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device struct Classifier { - __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, - const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize) - : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), - clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){} + __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, + const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize) + : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), + clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){} __device__ __forceinline__ void operator() (int y, int x, DevMem2D_ objects, const unsigned int maxN, unsigned int* n) const { @@ -255,11 +255,7 @@ namespace cv { namespace gpu { namespace device rect.z = clWidth; rect.w = clHeight; -#if (__CUDA_ARCH__ < 120) - int res = __atomicInc(n, maxN); -#else - int res = atomicInc(n, maxN); -#endif + int res = Emulation::smem::atomicInc(n, maxN); objects(0, res) = rect; } @@ -317,26 +313,17 @@ namespace cv { namespace gpu { namespace device __syncthreads(); int cls = labels[tid]; -#if (__CUDA_ARCH__ < 120) - __atomicAdd((rrects + cls * 4 + 
0), candidates[tid].x); - __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y); - __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z); - __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w); -#else - atomicAdd((rrects + cls * 4 + 0), candidates[tid].x); - atomicAdd((rrects + cls * 4 + 1), candidates[tid].y); - atomicAdd((rrects + cls * 4 + 2), candidates[tid].z); - atomicAdd((rrects + cls * 4 + 3), candidates[tid].w); -#endif + Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x); + Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y); + Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z); + Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w); + __syncthreads(); labels[tid] = 0; __syncthreads(); -#if (__CUDA_ARCH__ < 120) - __atomicInc((unsigned int*)labels + cls, n); -#else - atomicInc((unsigned int*)labels + cls, n); -#endif + Emulation::smem::atomicInc((unsigned int*)labels + cls, n); + __syncthreads(); *nclasses = 0; @@ -354,30 +341,26 @@ namespace cv { namespace gpu { namespace device if (active && active >= groupThreshold) { - int* r1 = rrects + tid * 4; - int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]); + int* r1 = rrects + tid * 4; + int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]); -#if (__CUDA_ARCH__ < 120) - objects[__atomicInc(nclasses, n)] = r_out; -#else - int aidx = atomicInc(nclasses, n); + int aidx = Emulation::smem::atomicInc(nclasses, n); objects[aidx] = r_out; -#endif } } void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_ objects, unsigned int* classified) - { - Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets, - (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize); + { + Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets, + (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize); - int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step); + int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step); - int block = 256; + int block = 256; int grid = divUp(total, block); lbp_classify_stump<<>>(clr, objects, objects.cols, classified, workWidth >> 1); - cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaGetLastError() ); } void connectedConmonents(DevMem2D_ candidates, int ncandidates, DevMem2D_ objects, int groupThreshold, float grouping_eps, unsigned int* nclasses) @@ -385,7 +368,124 @@ namespace cv { namespace gpu { namespace device int block = ncandidates; int smem = block * ( sizeof(int) + sizeof(int4) ); disjoin<<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses); - cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaGetLastError() ); + } + + struct Cascade + { + __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves, + const int* _subsets, const uchar4* _features, int _subsetSize) + + : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){} + + __device__ __forceinline__ bool operator() (int y, int x, int* 
integral, const int pitch/*, DevMem2D_ objects, const unsigned int maxN, unsigned int* n*/) const + { + int current_node = 0; + int current_leave = 0; + + for (int s = 0; s < nstages; ++s) + { + float sum = 0; + Stage stage = stages[s]; + for (int t = 0; t < stage.ntrees; t++) + { + ClNode node = nodes[current_node]; + uchar4 feature = features[node.featureIdx]; + + int shift; + int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift); + int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1; + sum += leaves[idx]; + + current_node += 1; + current_leave += 2; + } + + if (sum < stage.threshold) + return false; + } + + return true; + } + + const Stage* stages; + const int nstages; + + const ClNode* nodes; + const float* leaves; + const int* subsets; + const uchar4* features; + + const int subsetSize; + const LBP evaluator; + }; + + // stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k + __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor, + const int workAmount, int* integral, const int pitch, DevMem2D_ objects, unsigned int* classified) + { + int ftid = blockIdx.x * blockDim.x + threadIdx.x; + if (ftid >= workAmount ) return; + + int sum = 0; + // float scale = 1.0f; + float stepShift = (scale <= 2.f) ? 2.0 : 1.0; + int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift); + int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift); + + // if (!ftid) + // printf("!!!!: %d %d", w, h); + + int framTid = ftid; + int i = 0; + + while (1) + { + if (framTid < (w - 1) * (h - 1)) break; + i++; + sum += __float2int_rn(frameW / scale) + 1; + framTid -= w * h; + scale *= factor; + stepShift = (scale <= 2.f) ? 2.0 : 1.0; + int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift); + int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift); + } + + int y = (framTid / w); + int x = (framTid - y * w) * stepShift; + y *= stepShift; + x += sum; + + // if (i == 2) + // printf("!!!!!!!!!!!!!! 
%f %d %d %d\n", windowW * scale, sum, y, x); + + if (cascade(y, x, integral, pitch)) + { + int4 rect; + rect.x = roundf( (x - sum) * scale); + rect.y = roundf(y * scale); + rect.z = roundf(windowW * scale); + rect.w = roundf(windowH * scale); + + if (rect.x > frameW || rect.y > frameH) return; + // printf("OUTLAUER %d %d %d %d %d %d %d %d %d %f %f\n", x, y, ftid, framTid, rect.x, rect.y, sum, w, h, stepShift, scale); + + // printf("passed: %d %d ---- %d %d %d %d %d\n", y, x, rect.x, rect.y, rect.z, rect.w, sum); + + int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols); + objects(0, res) = rect; + + } + } + + void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount, + const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures, + const int subsetSize, DevMem2D_ objects, unsigned int* classified, DevMem2Di integral) + { + const int block = 256; + int grid = divUp(workAmount, block); + Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize); + lbp_cascade<<>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified); } } }}} \ No newline at end of file diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp index f3923a358f..fe5452b5cd 100644 --- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp @@ -44,18 +44,19 @@ #define OPENCV_GPU_EMULATION_HPP_ #include "warp_reduce.hpp" +#include namespace cv { namespace gpu { namespace device { struct Emulation { - template + template static __forceinline__ __device__ int Ballot(int predicate) { -#if (__CUDA_ARCH__ >= 200) +#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) return __ballot(predicate); #else - __shared__ volatile int cta_buffer[CTA_SIZE] + __shared__ volatile int cta_buffer[CTA_SIZE]; int tid = threadIdx.x; cta_buffer[tid] = predicate ? 
(1 << (tid & 31)) : 0; @@ -63,41 +64,62 @@ namespace cv { namespace gpu { namespace device #endif } - struct smem - { - enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U }; - - template - static __device__ __forceinline__ T atomicInc(T* address, T val) - { -#if (__CUDA_ARCH__ < 120) - -#else - -#endif - - } - - template - static __device__ __forceinline__ void atomicAdd(T* address, T val) - { -#if (__CUDA_ARCH__ < 120) - -#else - -#endif - } - - template - __device__ __forceinline__ T __atomicMin(T* address, T val) - { -#if (__CUDA_ARCH__ < 120) - -#else - -#endif - } - }; + struct smem + { + enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U }; + + template + static __device__ __forceinline__ T atomicInc(T* address, T val) + { +#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) + T count; + unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); + do + { + count = *address & TAG_MASK; + count = tag | (count + 1); + *address = count; + } while (*address != count); + + return (count & TAG_MASK) - 1; +#else + return ::atomicInc(address, val); +#endif + } + + template + static __device__ __forceinline__ void atomicAdd(T* address, T val) + { +#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) + T count; + unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); + do + { + count = *address & TAG_MASK; + count = tag | (count + val); + *address = count; + } while (*address != count); +#else + ::atomicAdd(address, val); +#endif + } + + template + static __device__ __forceinline__ T atomicMin(T* address, T val) + { +#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) + T count = min(*address, val); + do + { + *address = count; + } while (*address > count); + + return count; +#else + return ::atomicMin(address, val); +#endif + } + }; }; }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp index 8a7aa0eb92..0c8a03e334 100644 --- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp +++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp @@ -44,52 +44,11 @@ #define __OPENCV_GPU_DEVICE_LBP_HPP_ #include "internal_shared.hpp" +#include namespace cv { namespace gpu { namespace device { -namespace lbp{ - - #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U ) - - template - __device__ __forceinline__ T __atomicInc(T* address, T val) - { - T count; - unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); - do - { - count = *address & TAG_MASK; - count = tag | (count + 1); - *address = count; - } while (*address != count); - - return (count & TAG_MASK) - 1; - } - - template - __device__ __forceinline__ void __atomicAdd(T* address, T val) - { - T count; - unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); - do - { - count = *address & TAG_MASK; - count = tag | (count + val); - *address = count; - } while (*address != count); - } - - template - __device__ __forceinline__ T __atomicMin(T* address, T val) - { - T count = min(*address, val); - do - { - *address = count; - } while (*address > count); - - return count; - } +namespace lbp { struct Stage { @@ -127,27 +86,25 @@ namespace lbp{ unsigned tid = threadIdx.x; labels[tid] = tid; __syncthreads(); - for (unsigned int id = 0; id < n; id++) { if (tid != id && predicate(vec[tid], vec[id])) { int p = labels[tid]; int q = labels[id]; - - if (p != q) - { - int m = min(p, q); -#if (__CUDA_ARCH__ < 120) - __atomicMin(labels + id, m); -#else - atomicMin(labels + id, 
m); -#endif - } + if (p < q) + { + Emulation::smem::atomicMin(labels + id, p); + } + else if (p > q) + { + Emulation::smem::atomicMin(labels + tid, q); + } } } __syncthreads(); } + } // lbp } } }// namespaces
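A note on the host-side restructuring above: detectMultiScale no longer resizes and classifies one scale per kernel launch. Instead it packs several scaled integral images side by side into the single wide integral buffer (hence the new integralFactor = 4 multiplier on its width), sums the number of detection windows contributed by each packed level, and issues one classifyPyramid launch per batch. The sketch below is a host-only restatement of that bookkeeping under assumed inputs: the PyrLevel name, the 640x480 frame, the 24x24 window and the 1.2 scale factor are illustrative, it packs only the first batch, and it uses std::round rather than cvRound, but the offsets and window counts follow the same arithmetic as the patch's PyrLavel struct and its inner packing loop.

    // Host-only sketch of the integral-packing bookkeeping in detectMultiScale.
    // For each pyramid level it prints the x offset of that level's (w+1)-wide
    // integral image inside the packed buffer and the number of detection
    // windows the level adds to the batched kernel launch.
    #include <cmath>
    #include <cstdio>

    struct Size { int width, height; };

    struct PyrLevel                       // mirrors PyrLavel from the patch
    {
        PyrLevel(int order, float factor, Size frame, Size window)
        {
            scale    = std::pow(factor, order);
            sFrame   = { int(std::round(frame.width  / scale)),
                         int(std::round(frame.height / scale)) };
            workArea = { sFrame.width  - window.width  + 1,
                         sFrame.height - window.height + 1 };
            sWindow  = { int(std::round(window.width  * scale)),
                         int(std::round(window.height * scale)) };
        }

        bool feasible(Size maxObj) const
        {
            return workArea.width > 0 && workArea.height > 0
                && sWindow.width <= maxObj.width && sWindow.height <= maxObj.height;
        }

        float scale;
        Size sFrame, workArea, sWindow;
    };

    int main()
    {
        const Size  frame  = {640, 480};  // input image (illustrative)
        const Size  window = {24, 24};    // classifier window, NxM in the patch
        const float factor = 1.2f;        // scaleFactor
        const int   integralFactor = 4;   // width multiplier of the packed buffer

        int order = 0, acc = 0;           // acc = current x offset in the buffer
        PyrLevel level(order, factor, frame, window);

        // Pack scaled integrals side by side until the wide buffer is full,
        // like the inner while loop of detectMultiScale (first batch only).
        while (acc + level.sFrame.width + 1 <= integralFactor * (frame.width + 1)
               && level.feasible(frame))
        {
            float step    = 1.f + (level.scale <= 2.f); // finer stride at small scales
            int   windows = int(std::ceil(level.workArea.width  / step))
                          * int(std::ceil(level.workArea.height / step));

            std::printf("level %d: scale %.2f, integral at x = %d, %d windows\n",
                        order, level.scale, acc, windows);

            acc  += level.sFrame.width + 1;
            level = PyrLevel(++order, factor, frame, window);
        }
        return 0;
    }

When the wide buffer cannot take the next level, the real code launches classifyPyramid for the batch packed so far and keeps going with the remaining levels, so the launch count drops from one per scale to one per batch of scales.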
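On the device side, lbp_cascade is launched once for the whole batch with a flat work amount, so each thread has to work out which pyramid level it belongs to, where its window sits inside that level, and where that level's integral image starts in the packed buffer. The host-side sketch below restates that index arithmetic under assumptions of mine (the decode name, the sample sizes and the use of std::round are illustrative, and it glosses over the kernel's exact rounding details): walk down the levels subtracting each level's window count from the flat id until it fits, then split the remainder into a row and column using the level's step, and shift the column by the accumulated integral widths.

    // Host-side restatement of the flat-id -> (level, x, y) mapping performed by
    // the lbp_cascade kernel over the packed pyramid of integral images.
    #include <cmath>
    #include <cstdio>

    struct Hit { int level, x, y, integralOffsetX; float scale; };

    Hit decode(int ftid, int frameW, int frameH, int windowW, int windowH,
               float scale, float factor)
    {
        int level = 0;
        int offset = 0;                              // "sum" in the kernel
        for (;;)
        {
            float step    = (scale <= 2.f) ? 2.f : 1.f;  // "stepShift"
            int   scaledW = int(std::round(frameW / scale));
            int   scaledH = int(std::round(frameH / scale));
            int   w = int(std::ceil((scaledW - windowW + 1) / step));
            int   h = int(std::ceil((scaledH - windowH + 1) / step));

            if (w <= 0 || h <= 0)                    // past the last feasible level;
                return { -1, 0, 0, 0, scale };       // the real launch never gets here

            if (ftid < w * h)                        // the id belongs to this level
            {
                int y = ftid / w;
                int x = ftid - y * w;
                return { level, int(x * step) + offset, int(y * step), offset, scale };
            }

            ftid   -= w * h;                         // skip this level's windows
            offset += scaledW + 1;                   // next packed integral starts here
            scale  *= factor;
            ++level;
        }
    }

    int main()
    {
        const int   frameW = 640, frameH = 480, winW = 24, winH = 24;
        const float initialScale = 1.f, factor = 1.2f;

        const int ids[] = {0, 100000, 200000};
        for (int ftid : ids)
        {
            Hit hit = decode(ftid, frameW, frameH, winW, winH, initialScale, factor);
            std::printf("ftid %6d -> level %d (scale %.2f), window (%d, %d), "
                        "integral column offset %d\n",
                        ftid, hit.level, hit.scale,
                        hit.x - hit.integralOffsetX, hit.y, hit.integralOffsetX);
        }
        return 0;
    }

Keeping this mapping purely arithmetic is what lets one kernel launch cover every packed scale without per-level metadata in memory: a passing window is then rescaled back to frame coordinates with roundf(... * scale) and appended to the candidate list via Emulation::smem::atomicInc.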