diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu
index a34a52a313..09187fd259 100644
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -361,14 +361,8 @@ namespace cv { namespace gpu { namespace device
         {
             {
                 // each thread handles 16 values, use 1 block/row
-                int block = img.cols / 16;
-
                 // save, becouse step is actually can't be less 512 bytes
-                int align = img.cols % 4;
-                if ( align != 0)
-                {
-                    block += (4 - align);
-                }
+                int block = integral.cols / 16;
 
                 // launch 1 block / row
                 const int grid = img.rows;
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 81a2248fdb..309b14ae9f 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -553,44 +553,29 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
 
     src.locateROI(whole, offset);
 
-    if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
+    // Warp-shuffle path: taken only when the ROI's x offset is a multiple of 16 and the
+    // ROI width, rounded up to the 64-column granularity used for `buffer` below, still
+    // fits in the rest of the parent row (step - offset.x). NOTE(review): this presumes
+    // the kernel reads whole 64-column tiles of single-byte pixels - confirm.
+    if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
+        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
     {
-        GpuMat srcAlligned;
+        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
 
-        if (src.cols % 16 == 0 && src.rows % 8 == 0 && offset.x % 16 == 0 && offset.y % 8 == 0)
-            srcAlligned = src;
-        else
-        {
-            ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 15) / 16) * 16, src.type(), buffer);
-
-            GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
-
-            if (s)
-            {
-                s.enqueueMemSet(buffer, Scalar::all(0));
-                s.enqueueCopy(src, inner);
-            }
-            else
-            {
-                buffer.setTo(Scalar::all(0));
-                src.copyTo(inner);
-            }
-
-            srcAlligned = buffer;
-        }
-
-        sum.create(srcAlligned.rows + 1, srcAlligned.cols + 4, CV_32SC1);
+        cv::gpu::device::imgproc::shfl_integral_gpu(src, buffer, stream);
 
+        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
         if (s)
             s.enqueueMemSet(sum, Scalar::all(0));
         else
             sum.setTo(Scalar::all(0));
 
-        GpuMat inner = sum(Rect(4, 1, srcAlligned.cols, srcAlligned.rows));
+        GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
+        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
 
-        cv::gpu::device::imgproc::shfl_integral_gpu(srcAlligned, inner, stream);
-
-        sum = sum(Rect(3, 0, src.cols + 1, src.rows + 1));
+        if (s)
+            s.enqueueCopy(res, inner);
+        else
+            res.copyTo(inner);
     }
     else
     {