diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index 3533cce69a..412afee8b8 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -1816,8 +1816,14 @@ void cv::ocl::device::hog::normalize_hists(int nbins, openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1, "-D CPU"); else + { + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName); + int wave_size = queryDeviceInfo(kernel); + char opt[32] = {0}; + sprintf(opt, "-D WAVE_SIZE=%d", wave_size); openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, - localThreads, args, -1, -1); + localThreads, args, -1, -1, opt); + } } void cv::ocl::device::hog::classify_hists(int win_height, int win_width, @@ -1879,8 +1885,14 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1, "-D CPU"); else + { + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName); + int wave_size = queryDeviceInfo(kernel); + char opt[32] = {0}; + sprintf(opt, "-D WAVE_SIZE=%d", wave_size); openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, - localThreads, args, -1, -1); + localThreads, args, -1, -1, opt); + } } void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 05d538330f..b9103380d6 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -318,6 +318,10 @@ float reduce_smem(volatile __local float* smem, int size) if (tid < 32) { if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { +#endif if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; @@ -418,6 +422,9 @@ __kernel void classify_hists_180_kernel( { smem[tid] = product = product + smem[tid + 32]; } +#if WAVE_SIZE < 32 + barrier(CLK_LOCAL_MEM_FENCE); +#endif if (tid < 16) { smem[tid] = product = product + smem[tid + 16]; @@ -487,6 +494,10 @@ __kernel void classify_hists_252_kernel( if (tid < 32) { smem[tid] = product = product + smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { +#endif smem[tid] = product = product + smem[tid + 16]; smem[tid] = product = product + smem[tid + 8]; smem[tid] = product = product + smem[tid + 4]; @@ -553,6 +564,10 @@ __kernel void classify_hists_kernel( if (tid < 32) { smem[tid] = product = product + smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { +#endif smem[tid] = product = product + smem[tid + 16]; smem[tid] = product = product + smem[tid + 8]; smem[tid] = product = product + smem[tid + 4]; diff --git a/modules/ocl/test/test_objdetect.cpp b/modules/ocl/test/test_objdetect.cpp index 86590f7981..bc719b0974 100644 --- a/modules/ocl/test/test_objdetect.cpp +++ b/modules/ocl/test/test_objdetect.cpp @@ -146,17 +146,17 @@ TEST_P(HOG, Detect) if (winSize.width == 48 && winSize.height == 96) { // daimler's base - ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector48x96()); + ocl_hog.setSVMDetector(hog.getDaimlerPeopleDetector()); hog.setSVMDetector(hog.getDaimlerPeopleDetector()); } else if (winSize.width == 64 && winSize.height == 128) { - ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector64x128()); + ocl_hog.setSVMDetector(hog.getDefaultPeopleDetector()); hog.setSVMDetector(hog.getDefaultPeopleDetector()); } else { - ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector()); + ocl_hog.setSVMDetector(hog.getDefaultPeopleDetector()); hog.setSVMDetector(hog.getDefaultPeopleDetector()); }