diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
index f0487758de..abfd8c38cb 100644
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -175,7 +175,7 @@ void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a,
 
 
 __global__ void matchTemplatePreparedKernel_8U_SQDIFF(
-        int w, int h, const PtrStepf image_sumsq, float templ_sumsq,
+        int w, int h, const PtrStep_<unsigned long long> image_sqsum, float templ_sqsum,
         DevMem2Df result)
 {
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -183,24 +183,80 @@ __global__ void matchTemplatePreparedKernel_8U_SQDIFF(
 
     if (x < result.cols && y < result.rows)
     {
-        float image_sq = image_sumsq.ptr(y + h)[x + w]
-                         - image_sumsq.ptr(y)[x + w]
-                         - image_sumsq.ptr(y + h)[x]
-                         + image_sumsq.ptr(y)[x];
+        float image_sq = (float)(
+                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
+                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
         float ccorr = result.ptr(y)[x];
-        result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sumsq;
+        result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sqsum;
     }
 }
 
 
 void matchTemplatePrepared_8U_SQDIFF(
-        int w, int h, const DevMem2Df image_sumsq, float templ_sumsq,
+        int w, int h, const DevMem2D_<unsigned long long> image_sqsum, float templ_sqsum,
         DevMem2Df result)
 {
     dim3 threads(32, 8);
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
     matchTemplatePreparedKernel_8U_SQDIFF<<<grid, threads>>>(
-            w, h, image_sumsq, templ_sumsq, result);
+            w, h, image_sqsum, templ_sqsum, result);
+    cudaSafeCall(cudaThreadSynchronize());
+}
+
+
+__global__ void matchTemplatePreparedKernel_8U_SQDIFF_NORMED(
+        int w, int h, const PtrStep_<unsigned long long> image_sqsum, float templ_sqsum,
+        DevMem2Df result)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < result.cols && y < result.rows)
+    {
+        float image_sq = (float)(
+                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
+                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
+        float ccorr = result.ptr(y)[x];
+        result.ptr(y)[x] = (image_sq - 2.f * ccorr + templ_sqsum) *
+                           rsqrtf(image_sq * templ_sqsum);
+    }
+}
+
+
+void matchTemplatePrepared_8U_SQDIFF_NORMED(
+        int w, int h, const DevMem2D_<unsigned long long> image_sqsum, float templ_sqsum,
+        DevMem2Df result)
+{
+    dim3 threads(32, 8);
+    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
+    matchTemplatePreparedKernel_8U_SQDIFF_NORMED<<<grid, threads>>>(
+            w, h, image_sqsum, templ_sqsum, result);
+    cudaSafeCall(cudaThreadSynchronize());
+}
+
+
+__global__ void normalizeKernel_8U(int w, int h, const PtrStep_<unsigned long long> image_sqsum,
+                                   float templ_sqsum, DevMem2Df result)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < result.cols && y < result.rows)
+    {
+        float image_sq = (float)(
+                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
+                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
+        result.ptr(y)[x] *= rsqrtf(image_sq * templ_sqsum);
+    }
+}
+
+
+void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
+                  float templ_sqsum, DevMem2Df result)
+{
+    dim3 threads(32, 8);
+    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
+    normalizeKernel_8U<<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
     cudaSafeCall(cudaThreadSynchronize());
 }
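
Reviewer note, not part of the patch: the SQDIFF kernels rely on the expansion
sum((I - T)^2) = sum(I^2) - 2*sum(I*T) + sum(T^2) over each w x h window. The
cross-correlation term sum(I*T) is already sitting in `result` from the earlier
spectrum pass, `templ_sqsum` supplies sum(T^2), and sum(I^2) is read in O(1)
from the squared integral image via four corner lookups, grouped as
(a - b) - (c - d) so the unsigned intermediates never underflow. The standalone
host-side sketch below uses made-up data and a hypothetical `rectSum` helper
that mirrors the kernel's indexing; it assumes `image_sqsum` carries an extra
leading row and column of zeros, which the `(y + h)`/`(x + w)` indexing implies.

// Standalone sketch (assumptions as stated above, not OpenCV code): verifies
// the four-corner lookup used by matchTemplatePreparedKernel_8U_SQDIFF
// against a brute-force sum of squares.
#include <cassert>
#include <cstdio>
#include <vector>

typedef unsigned long long u64;

// Sum over the w x h window with top-left (x, y), via four SAT lookups;
// grouped as (a - b) - (c - d) so unsigned intermediates stay nonnegative.
static u64 rectSum(const std::vector<u64>& sat, int stride,
                   int x, int y, int w, int h)
{
    return (sat[(y + h) * stride + (x + w)] - sat[y * stride + (x + w)])
         - (sat[(y + h) * stride + x]       - sat[y * stride + x]);
}

int main()
{
    const int W = 4, H = 3;                      // tiny 8-bit test image
    unsigned char img[H * W] = { 1, 2, 3, 4,
                                 5, 6, 7, 8,
                                 9, 8, 7, 6 };

    // Squared integral image with a zero top row and left column, so that
    // sat[y][x] = sum of img[j][i]^2 over all j < y, i < x.
    std::vector<u64> sat((W + 1) * (H + 1), 0);
    for (int y = 0; y < H; ++y)
        for (int x = 0; x < W; ++x)
            sat[(y + 1) * (W + 1) + (x + 1)] =
                  (u64)img[y * W + x] * img[y * W + x]
                + sat[y * (W + 1) + (x + 1)]
                + sat[(y + 1) * (W + 1) + x]
                - sat[y * (W + 1) + x];

    // Check every placement of a 2x2 window against brute force.
    const int w = 2, h = 2;
    for (int y = 0; y + h <= H; ++y)
        for (int x = 0; x + w <= W; ++x)
        {
            u64 brute = 0;
            for (int j = 0; j < h; ++j)
                for (int i = 0; i < w; ++i)
                    brute += (u64)img[(y + j) * W + x + i]
                                * img[(y + j) * W + x + i];
            assert(rectSum(sat, W + 1, x, y, w, h) == brute);
        }
    printf("four-corner lookups match brute force\n");
    return 0;
}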
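
The normalized variants reuse the same window statistic as a scale factor:
matchTemplatePreparedKernel_8U_SQDIFF_NORMED divides the expanded squared
difference by sqrt(sum(I^2) * sum(T^2)), while normalizeKernel_8U applies only
the rsqrtf factor to whatever `result` already holds (presumably the CCORR
response, yielding CCORR_NORMED). A minimal numeric check of both formulas,
with 1.f / sqrtf(x) standing in on the host for the device intrinsic rsqrtf(x):

// Sanity check of the normalization applied by the new kernels.
#include <cmath>
#include <cstdio>

int main()
{
    // For a window identical to the template, sum(I*T) = sum(I^2) = sum(T^2).
    float image_sq = 42.f, templ_sqsum = 42.f, ccorr = 42.f;

    // SQDIFF_NORMED: (sum I^2 - 2*sum(I*T) + sum T^2) / sqrt(sum I^2 * sum T^2)
    float sqdiff_normed = (image_sq - 2.f * ccorr + templ_sqsum)
                          * (1.f / sqrtf(image_sq * templ_sqsum));

    // normalize_8U path: sum(I*T) / sqrt(sum I^2 * sum T^2)
    float ccorr_normed = ccorr * (1.f / sqrtf(image_sq * templ_sqsum));

    printf("perfect match: SQDIFF_NORMED = %g, CCORR_NORMED = %g\n",
           sqdiff_normed, ccorr_normed);   // expect 0 and 1
    return 0;
}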