From 2d8c89c40b46b8084534d116f1db5987a24d57c6 Mon Sep 17 00:00:00 2001 From: Chuanbo Weng Date: Thu, 4 Dec 2014 11:15:13 +0800 Subject: [PATCH] Remove unnecessary kercn limitation of 4. When global memory is accessed in DWORD4 units, memory bandwidth can be fully utilized on Intel platforms. This patch allows more image formats (e.g. 8UC4) to be processed as DWORD4 by each work-item. After applying this patch, 3 subcases of ./opencv_perf_core --gtest_filter=OCL_RepeatFixture_Repeat.Repeat/* are sped up on an HD4000 graphics card with Beignet: OCL_RepeatFixture_Repeat.Repeat/2, 64% improvement. OCL_RepeatFixture_Repeat.Repeat/6, 50% improvement. OCL_RepeatFixture_Repeat.Repeat/8, 56% improvement. Signed-off-by: Chuanbo Weng --- modules/core/src/copy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 1c6882917b..301ea80a1f 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -846,7 +846,7 @@ static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst) int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1, - kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4); + kercn = ocl::predictOptimalVectorWidth(_src, _dst); ocl::Kernel k("repeat", ocl::core::repeat_oclsrc, format("-D T=%s -D nx=%d -D ny=%d -D rowsPerWI=%d -D cn=%d",