From db08656a3826fc392ca62a6cbc2b2584a54575cd Mon Sep 17 00:00:00 2001
From: Marina Kolpakova <no@email>
Date: Mon, 18 Jun 2012 10:31:36 +0000
Subject: [PATCH] Fix area resize for scale factors that do not divide 128 evenly

---
 modules/gpu/src/cuda/resize.cu   | 35 +++++++++++++++++++-------------
 modules/gpu/src/resize.cpp       |  1 +
 modules/gpu/test/test_resize.cpp | 16 +++++++--------
 3 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu
index b84b29f92a..844df9be2f 100644
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
         }
 
         template<typename T, typename W>
-        __global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines)
+        __global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride)
         {
             extern __shared__ W sbuf[];
 
@@ -545,11 +545,14 @@ namespace cv { namespace gpu { namespace device
 
             // load line-block on shared memory
             int y = blockIdx.x / thred_lines;
-            int input_stride = (blockIdx.x - y * thred_lines) * blockDim.x;
+            int input_stride = (blockIdx.x % thred_lines) * stride;
             int x = input_stride  + tid;
 
             // store global data in shared memory
-            sbuf[tid] = src(y, x);
+            if (x  < src.cols && y < src.rows)
+                sbuf[tid] = src(y, x);
+            else
+                sbuf[tid] = 0;
             __syncthreads();
 
             scan_block<inclusive, W>(sbuf);
@@ -575,7 +578,7 @@ namespace cv { namespace gpu { namespace device
         }
 
         template<typename T, typename W>
-        __global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines)
+        __global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride)
         {
             extern __shared__ W sbuf[];
 
@@ -584,13 +587,15 @@ namespace cv { namespace gpu { namespace device
             // load line-block on shared memory
             int x = blockIdx.x / thred_lines;
 
-            int global_stride = (blockIdx.x % thred_lines) * blockDim.x;
-            if (!tid) printf("STRIDE : %d", global_stride);
+            int global_stride = (blockIdx.x % thred_lines) * stride;
             int y = global_stride + tid;
 
             // store global data in shared memory
+            if (x  < src.cols && y < src.rows)
+                sbuf[tid] = src(y, x);
+            else
+                sbuf[tid] = 0;
 
-            sbuf[tid] = src(y, x);
             __syncthreads();
             scan_block<inclusive, W>(sbuf);
 
@@ -623,28 +628,30 @@ namespace cv { namespace gpu { namespace device
             int iscale_x = round(fx);
             int iscale_y = round(fy);
 
-            const int warps = 4;
+            int warps = 4;
             const int threads = 32 * warps;
+            int input_stride = threads / iscale_x;
 
-            int thred_lines = divUp(src.cols, threads);
+            int thred_lines = divUp(src.cols, input_stride * iscale_x);
             int blocks = src.rows * thred_lines;
 
-            printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n",
-                   src.cols, warps, threads, thred_lines, blocks);
+            printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d input strude %d\n",
+                   src.cols, warps, threads, thred_lines, blocks, input_stride * iscale_x);
 
             typedef typename scan_traits<T>::scan_line_type smem_type;
 
             resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
-                    (src, buffer, iscale_x, iscale_y, thred_lines);
+                    (src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x);
 
-            thred_lines = divUp(src.rows, threads);
+            input_stride = threads / iscale_y;
+            thred_lines = divUp(src.rows, input_stride * iscale_y);
             blocks = dst.cols * thred_lines;
 
             printf("device code executed for Y coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n",
                    dst.rows, warps, threads, thred_lines, blocks);
 
             resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
-                    (buffer, dst, iscale_x, iscale_y, thred_lines);
+                    (buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y);
 
             cudaSafeCall( cudaGetLastError() );
 
diff --git a/modules/gpu/src/resize.cpp b/modules/gpu/src/resize.cpp
index ff41818848..25bdce42d2 100644
--- a/modules/gpu/src/resize.cpp
+++ b/modules/gpu/src/resize.cpp
@@ -95,6 +95,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize,
     CV_Assert( (fx < 1.0) && (fy < 1.0));
     CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
     CV_Assert(src.cols >= 128 && src.rows >= 128);
+    CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0);
 
     if (dsize == Size())
         dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
diff --git a/modules/gpu/test/test_resize.cpp b/modules/gpu/test/test_resize.cpp
index 5e03786828..81de33a1bd 100644
--- a/modules/gpu/test/test_resize.cpp
+++ b/modules/gpu/test/test_resize.cpp
@@ -201,13 +201,13 @@ TEST_P(ResizeArea, Accuracy)
    cv::Mat gpu;
    dst.download(gpu);
 
-   std::cout //<< src
+   // std::cout // << src
+   // // << std::endl << std::endl
+   // // << gpu_buff
+   // // << std::endl << std::endl
+   // << gpu
    // << std::endl << std::endl
-   // << gpu_buff
-   // << std::endl << std::endl
-   << gpu
-   << std::endl << std::endl
-   << dst_cpu<<  std::endl;
+   // << dst_cpu<<  std::endl;
 
 
     EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
@@ -215,9 +215,9 @@ TEST_P(ResizeArea, Accuracy)
 
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine(
     ALL_DEVICES,
-    testing::Values(cv::Size(640, 10 *  128)),//DIFFERENT_SIZES,
+    testing::Values(cv::Size(640, 480)),//DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1)/*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
-    testing::Values(0.1),
+    testing::Values(0.05, 0.1),
     testing::Values(Interpolation(cv::INTER_AREA)),
     WHOLE_SUBMAT));