diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index a7f0ab32d9..c6f2e1141e 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -820,6 +820,7 @@ private:
     int nLayers_;
 };
 
+//! HoughLines
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta);
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
index 9104892db4..979aaa3723 100644
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -1626,7 +1626,7 @@ PERF_TEST_P(Sz_DoSort, ImgProc_HoughLines, Combine(GPU_TYPICAL_MAT_SIZES, Bool()
 
     cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
 
-    const int numLines = rng.uniform(500, 2000);
+    const int numLines = rng.uniform(100, 300);
     for (int i = 0; i < numLines; ++i)
     {
         cv::Point p1(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu
index 82bd04caad..66433aba7e 100644
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
         {
             __shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
             __shared__ int s_qsize[4];
-            __shared__ int s_start[4];
+            __shared__ int s_globStart[4];
 
             const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -73,9 +73,10 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();
 
             // fill the queue
+            const uchar* srcRow = src.ptr(y);
             for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
             {
-                if (src(y, xx))
+                if (srcRow[xx])
                 {
                     const unsigned int val = (y << 16) | xx;
                     const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
@@ -89,36 +90,34 @@ namespace cv { namespace gpu { namespace device
             if (threadIdx.x == 0 && threadIdx.y == 0)
             {
                 // find how many items are stored in each list
-                int total_size = 0;
+                int totalSize = 0;
                 for (int i = 0; i < blockDim.y; ++i)
                 {
-                    s_start[i] = total_size;
-                    total_size += s_qsize[i];
+                    s_globStart[i] = totalSize;
+                    totalSize += s_qsize[i];
                 }
 
                 // calculate the offset in the global list
-                const int global_offset = atomicAdd(&g_counter, total_size);
+                const int globalOffset = atomicAdd(&g_counter, totalSize);
                 for (int i = 0; i < blockDim.y; ++i)
-                    s_start[i] += global_offset;
+                    s_globStart[i] += globalOffset;
             }
 
             __syncthreads();
 
             // copy local queues to global queue
             const int qsize = s_qsize[threadIdx.y];
-            for(int i = threadIdx.x; i < qsize; i += blockDim.x)
-            {
-                const unsigned int val = s_queues[threadIdx.y][i];
-                list[s_start[threadIdx.y] + i] = val;
-            }
+            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
+            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
+                list[gidx] = s_queues[threadIdx.y][i];
         }
 
         int buildPointList_gpu(DevMem2Db src, unsigned int* list)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
 
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
 
             const dim3 block(32, 4);
             const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
@@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace device
 
             cudaSafeCall( cudaDeviceSynchronize() );
 
-            int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            int totalCount;
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
 
-            return total_count;
+            return totalCount;
         }
 
         ////////////////////////////////////////////////////////////////////////
@@ -144,24 +143,26 @@ namespace cv { namespace gpu { namespace device
             const int n = blockIdx.x;
             const float ang = n * theta;
 
-            float sin_ang;
-            float cos_ang;
-            sincosf(ang, &sin_ang, &cos_ang);
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;
 
-            const float tabSin = sin_ang * irho;
-            const float tabCos = cos_ang * irho;
+            const int shift = (numrho - 1) / 2;
 
+            int* accumRow = accum.ptr(n + 1);
             for (int i = threadIdx.x; i < count; i += blockDim.x)
             {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];
 
-                const int x = (qvalue & 0x0000FFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;
 
-                int r = __float2int_rn(x * tabCos + y * tabSin);
-                r += (numrho - 1) / 2;
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;
 
-                ::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
+                ::atomicAdd(accumRow + r + 1, 1);
             }
         }
 
@@ -177,30 +178,32 @@ namespace cv { namespace gpu { namespace device
             const int n = blockIdx.x;
             const float ang = n * theta;
 
-            float sin_ang;
-            float cos_ang;
-            sincosf(ang, &sin_ang, &cos_ang);
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;
 
-            const float tabSin = sin_ang * irho;
-            const float tabCos = cos_ang * irho;
+            const int shift = (numrho - 1) / 2;
 
             for (int i = threadIdx.x; i < count; i += blockDim.x)
             {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];
 
-                const int x = (qvalue & 0x0000FFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;
 
-                int r = __float2int_rn(x * tabCos + y * tabSin);
-                r += (numrho - 1) / 2;
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;
 
                 Emulation::smem::atomicAdd(&smem[r + 1], 1);
             }
 
             __syncthreads();
 
-            for (int i = threadIdx.x; i < numrho; i += blockDim.x)
-                accum(n + 1, i) = smem[i];
+            int* accumRow = accum.ptr(n + 1);
+            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
+                accumRow[i] = smem[i];
         }
 
         void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
@@ -225,21 +228,21 @@ namespace cv { namespace gpu { namespace device
         ////////////////////////////////////////////////////////////////////////
         // linesGetResult
 
-        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
+        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const float threshold, const int numrho)
         {
             __shared__ int smem[8][32];
 
-            int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
-            int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
+            const int x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
+            const int y = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
 
-            if (r >= accum.cols || n >= accum.rows)
+            if (x >= accum.cols || y >= accum.rows)
                 return;
 
-            smem[threadIdx.y][threadIdx.x] = accum(n, r);
+            smem[threadIdx.y][threadIdx.x] = accum(y, x);
             __syncthreads();
 
-            r -= 1;
-            n -= 1;
+            const int r = x - 1;
+            const int n = y - 1;
 
             if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 || threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 || r >= accum.cols - 2 || n >= accum.rows - 2)
                 return;
@@ -264,32 +267,32 @@ namespace cv { namespace gpu { namespace device
 
         int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
 
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
 
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));
 
-            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, threshold, theta, rho, accum.cols - 2);
+            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );
 
-            int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            int totalCount;
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
 
-            total_count = ::min(total_count, maxSize);
+            totalCount = ::min(totalCount, maxSize);
 
-            if (doSort && total_count > 0)
+            if (doSort && totalCount > 0)
             {
-                thrust::device_ptr<float2> out_ptr(out);
-                thrust::device_ptr<int> votes_ptr(votes);
-                thrust::sort_by_key(votes_ptr, votes_ptr + total_count, out_ptr, thrust::greater<int>());
+                thrust::device_ptr<float2> outPtr(out);
+                thrust::device_ptr<int> votesPtr(votes);
+                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
             }
 
-            return total_count;
+            return totalCount;
         }
     }
 }}}
diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp
index ba61ad78c8..3b683ff40c 100644
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -57,11 +57,27 @@ namespace cv { namespace gpu { namespace device
     namespace hough
     {
         int buildPointList_gpu(DevMem2Db src, unsigned int* list);
+
         void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
         int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort);
     }
 }}}
 
+//////////////////////////////////////////////////////////
+// HoughLines
+
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    GpuMat accum, buf;
+    HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
+}
+
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    HoughLinesTransform(src, accum, buf, rho, theta);
+    HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
+}
+
 void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta)
 {
     using namespace cv::gpu::device::hough;
@@ -80,23 +96,23 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
     CV_Assert(numangle > 0 && numrho > 0);
 
     ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
-    accum.setTo(cv::Scalar::all(0));
+    accum.setTo(Scalar::all(0));
 
-    cv::gpu::DeviceInfo devInfo;
+    DeviceInfo devInfo;
 
     if (count > 0)
-        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(cv::gpu::FEATURE_SET_COMPUTE_20));
+        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
 }
 
 void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
 {
-    using namespace cv::gpu::device;
+    using namespace cv::gpu::device::hough;
 
     CV_Assert(accum.type() == CV_32SC1);
 
     ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
 
-    int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
+    int count = linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
 
     if (count > 0)
         lines.cols = count;
@@ -104,18 +120,6 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
         lines.release();
 }
 
-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    cv::gpu::GpuMat accum, buf;
-    HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
-}
-
-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    HoughLinesTransform(src, accum, buf, rho, theta);
-    HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
-}
-
 void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_)
 {
     if (d_lines.empty())
@@ -129,14 +133,14 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou
     CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2);
 
     h_lines_.create(1, d_lines.cols, CV_32FC2);
-    cv::Mat h_lines = h_lines_.getMat();
+    Mat h_lines = h_lines_.getMat();
     d_lines.row(0).download(h_lines);
 
     if (h_votes_.needed())
     {
         h_votes_.create(1, d_lines.cols, CV_32SC1);
-        cv::Mat h_votes = h_votes_.getMat();
-        cv::gpu::GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
+        Mat h_votes = h_votes_.getMat();
+        GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
         d_votes.download(h_votes);
     }
 }
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index 4f402da16d..06662d8d2e 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -1129,63 +1129,67 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerMinEigen, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughLines
 
-PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, std::string)
+PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, cv::Size, UseRoi)
 {
-};
-
-void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
-{
-    for (size_t i = 0; i < lines.size(); ++i)
+    void generateLines(cv::Mat& img)
     {
-        float rho = lines[i][0], theta = lines[i][1];
-        cv::Point pt1, pt2;
-        double a = std::cos(theta), b = std::sin(theta);
-        double x0 = a*rho, y0 = b*rho;
-        pt1.x = cvRound(x0 + 1000*(-b));
-        pt1.y = cvRound(y0 + 1000*(a));
-        pt2.x = cvRound(x0 - 1000*(-b));
-        pt2.y = cvRound(y0 - 1000*(a));
-        cv::line(dst, pt1, pt2, cv::Scalar::all(255));
+        img.setTo(cv::Scalar::all(0));
+
+        cv::line(img, cv::Point(20, 0), cv::Point(20, img.rows), cv::Scalar::all(255));
+        cv::line(img, cv::Point(0, 50), cv::Point(img.cols, 50), cv::Scalar::all(255));
+        cv::line(img, cv::Point(0, 0), cv::Point(img.cols, img.rows), cv::Scalar::all(255));
+        cv::line(img, cv::Point(img.cols, 0), cv::Point(0, img.rows), cv::Scalar::all(255));
     }
-}
+
+    void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
+    {
+        dst.setTo(cv::Scalar::all(0));
+
+        for (size_t i = 0; i < lines.size(); ++i)
+        {
+            float rho = lines[i][0], theta = lines[i][1];
+            cv::Point pt1, pt2;
+            double a = std::cos(theta), b = std::sin(theta);
+            double x0 = a*rho, y0 = b*rho;
+            pt1.x = cvRound(x0 + 1000*(-b));
+            pt1.y = cvRound(y0 + 1000*(a));
+            pt2.x = cvRound(x0 - 1000*(-b));
+            pt2.y = cvRound(y0 - 1000*(a));
+            cv::line(dst, pt1, pt2, cv::Scalar::all(255));
+        }
+    }
+};
 
 TEST_P(HoughLines, Accuracy)
 {
     const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
     cv::gpu::setDevice(devInfo.deviceID());
-    const std::string fileName = GET_PARAM(1);
+    const cv::Size size = GET_PARAM(1);
+    const bool useRoi = GET_PARAM(2);
 
     const float rho = 1.0f;
-    const float theta = static_cast<float>(CV_PI / 180);
-    const int threshold = 50;
+    const float theta = 1.5f * CV_PI / 180.0f;
+    const int threshold = 100;
 
-    cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::Mat edges;
-    cv::Canny(img, edges, 50, 200);
+    cv::Mat src(size, CV_8UC1);
+    generateLines(src);
 
     cv::gpu::GpuMat d_lines;
-    cv::gpu::HoughLines(loadMat(edges), d_lines, rho, theta, threshold);
+    cv::gpu::HoughLines(loadMat(src, useRoi), d_lines, rho, theta, threshold);
+
     std::vector<cv::Vec2f> lines;
     cv::gpu::HoughLinesDownload(d_lines, lines);
-    cv::Mat dst(img.size(), CV_8UC1, cv::Scalar::all(0));
+
+    cv::Mat dst(size, CV_8UC1);
     drawLines(dst, lines);
 
-    std::vector<cv::Vec2f> lines_gold;
-    cv::HoughLines(edges, lines_gold, rho, theta, threshold);
-    cv::Mat dst_gold(img.size(), CV_8UC1, cv::Scalar::all(0));
-    drawLines(dst_gold, lines_gold);
-
-    ASSERT_MAT_NEAR(dst_gold, dst, 0.0);
+    ASSERT_MAT_NEAR(src, dst, 0.0);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine(
     ALL_DEVICES,
-    testing::Values(std::string("../cv/shared/pic1.png"),
-                    std::string("../cv/shared/pic3.png"),
-                    std::string("../cv/shared/pic5.png"),
-                    std::string("../cv/shared/pic6.png"))));
+    DIFFERENT_SIZES,
+    WHOLE_SUBMAT));
 
 } // namespace