diff --git a/modules/gpu/perf/perf_arithm.cpp b/modules/gpu/perf/perf_arithm.cpp
index 2c915904c2..21e3f32900 100644
--- a/modules/gpu/perf/perf_arithm.cpp
+++ b/modules/gpu/perf/perf_arithm.cpp
@@ -716,3 +716,34 @@ PERF_TEST_P(DevInfo_Size_MatType, addWeighted, testing::Combine(testing::ValuesI
 
     SANITY_CHECK(dst_host);
 }
+
+PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::ValuesIn(devices()), // perf case for reduce(): parameterized over device, matrix size, matrix type and dim
+                                                                    testing::Values(GPU_TYPICAL_MAT_SIZES),
+                                                                    testing::Values(CV_8UC1, CV_8UC4, CV_32FC1),
+                                                                    testing::Values((int)HORIZONTAL_AXIS, (int)VERTICAL_AXIS))) // flip-axis enumerators reused as the dim argument; NOTE(review): presumably 0 and 1 - confirm
+{
+    DeviceInfo devInfo = std::tr1::get<0>(GetParam());
+    Size size = std::tr1::get<1>(GetParam());
+    int type = std::tr1::get<2>(GetParam());
+    int dim = std::tr1::get<3>(GetParam()); // fourth tuple element, forwarded to reduce() below
+
+    setDevice(devInfo.deviceID());
+
+    Mat src_host(size, type);
+
+    declare.in(src_host, WARMUP_RNG);
+
+    GpuMat src(src_host);
+    GpuMat dst(size, type); // constructed with the source's size and type, outside the timed cycle
+
+    declare.time(0.5).iterations(100);
+
+    SIMPLE_TEST_CYCLE()
+    {
+        reduce(src, dst, dim, CV_REDUCE_MIN); // the only statement inside the test cycle; the reduction op is fixed to CV_REDUCE_MIN
+    }
+
+    Mat dst_host = dst; // result is handed to SANITY_CHECK as a Mat
+
+    SANITY_CHECK(dst_host);
+}
diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu
index 722be26187..1dfd4b3b30 100644
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -1894,27 +1894,29 @@ namespace cv { namespace gpu { namespace mathfunc
 
         const int x = blockIdx.x * 16 + threadIdx.x;
 
+        S myVal = op.startValue(); // now declared before the bounds check, so threads with x >= src.cols still hold a value for the store into smem below
+
         if (x < src.cols)
         {
-            S myVal = op.startValue();
-
             for (int y = threadIdx.y; y < src.rows; y += 16)
                 myVal = op(myVal, src.ptr(y)[x]);
+        } // the x < src.cols guard ends here; the statements below are no longer nested inside it
 
-            smem[threadIdx.y * 16 + threadIdx.x] = myVal;
-            __syncthreads();
+        smem[threadIdx.x * 16 + threadIdx.y] = myVal; // indices swapped versus the removed store: the values of one threadIdx.x become adjacent; assumes threadIdx.y < 16 - TODO confirm block size
+        __syncthreads(); // no longer inside the x < src.cols branch
 
-            if (threadIdx.y == 0)
-            {
-                myVal = smem[threadIdx.x];
-
-                #pragma unroll
-                for (int i = 1; i < 16; ++i)
-                    myVal = op(myVal, smem[i * 16 + threadIdx.x]);
-
-                dst[x] = saturate_cast(op.result(myVal, src.rows));
-            }
+        if (threadIdx.x < 8) // threads with threadIdx.x < 8 fold the 16 values of smem row threadIdx.y
+        {
+            volatile S* srow = smem + threadIdx.y * 16;
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]); // offsets 8, 4, 2, 1 are folded in turn; NOTE(review): no barrier between the steps - presumably relies on these threads running in lockstep, confirm for the target devices
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);
         }
+        __syncthreads();
+
+        if (threadIdx.y == 0 && x < src.cols) // only threads with threadIdx.y == 0 and an in-range x store a result
+            dst[x] = saturate_cast(op.result(smem[threadIdx.x * 16], src.rows)); // reads element 0 of smem row threadIdx.x; NOTE(review): saturate_cast carries no template argument in this text - confirm against the upstream patch
     }
 
     template