From b37aaa8303129dfeb6f2e7b151493055a0ea9ee6 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 14 May 2015 15:39:42 +0300 Subject: [PATCH] Significantly improved parallel non-local means by using the granularity parameter in the parallel_for_ loop. Because the algorithm deals with sliding sums, it's essential that each thread has enough work to do; otherwise the algorithm's theoretical complexity increases, and at best there is no speedup compared to single-threaded code. --- modules/photo/src/denoising.cpp | 44 ++++++++++++++++++--------- modules/photo/test/test_denoising.cpp | 11 +++++++ 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp index c68d09b925..93d4b4ebbe 100644 --- a/modules/photo/src/denoising.cpp +++ b/modules/photo/src/denoising.cpp @@ -50,42 +50,50 @@ static void fastNlMeansDenoising_( const Mat& src, Mat& dst, const std::vector( - src, dst, templateWindowSize, searchWindowSize, &h[0])); + src, dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); break; case 2: if (hn == 1) parallel_for_(cv::Range(0, src.rows), FastNlMeansDenoisingInvoker, IT, UIT, D, int>( - src, dst, templateWindowSize, searchWindowSize, &h[0])); + src, dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); else parallel_for_(cv::Range(0, src.rows), FastNlMeansDenoisingInvoker, IT, UIT, D, Vec2i>( - src, dst, templateWindowSize, searchWindowSize, &h[0])); + src, dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); break; case 3: if (hn == 1) parallel_for_(cv::Range(0, src.rows), FastNlMeansDenoisingInvoker, IT, UIT, D, int>( - src, dst, templateWindowSize, searchWindowSize, &h[0])); + src, dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); else parallel_for_(cv::Range(0, src.rows), FastNlMeansDenoisingInvoker, IT, UIT, D, Vec3i>( - src, dst, templateWindowSize, searchWindowSize, &h[0])); + src, dst, templateWindowSize, searchWindowSize, &h[0]), 
+ granularity); break; case 4: if (hn == 1) parallel_for_(cv::Range(0, src.rows), FastNlMeansDenoisingInvoker, IT, UIT, D, int>( - src, dst, templateWindowSize, searchWindowSize, &h[0])); + src, dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); else parallel_for_(cv::Range(0, src.rows), FastNlMeansDenoisingInvoker, IT, UIT, D, Vec4i>( - src, dst, templateWindowSize, searchWindowSize, &h[0])); + src, dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); break; default: CV_Error(Error::StsBadArg, @@ -237,6 +245,7 @@ static void fastNlMeansDenoisingMulti_( const std::vector& srcImgs, Mat& ds int templateWindowSize, int searchWindowSize) { int hn = (int)h.size(); + double granularity = (double)std::max(1., (double)dst.total()/(1 << 16)); switch (srcImgs[0].type()) { @@ -244,43 +253,50 @@ static void fastNlMeansDenoisingMulti_( const std::vector& srcImgs, Mat& ds parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker( srcImgs, imgToDenoiseIndex, temporalWindowSize, - dst, templateWindowSize, searchWindowSize, &h[0])); + dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); break; case CV_8UC2: if (hn == 1) parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker, IT, UIT, D, int>( srcImgs, imgToDenoiseIndex, temporalWindowSize, - dst, templateWindowSize, searchWindowSize, &h[0])); + dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); else parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker, IT, UIT, D, Vec2i>( srcImgs, imgToDenoiseIndex, temporalWindowSize, - dst, templateWindowSize, searchWindowSize, &h[0])); + dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); break; case CV_8UC3: if (hn == 1) parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker, IT, UIT, D, int>( srcImgs, imgToDenoiseIndex, temporalWindowSize, - dst, templateWindowSize, searchWindowSize, &h[0])); + dst, templateWindowSize, 
searchWindowSize, &h[0]), + granularity); else parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker, IT, UIT, D, Vec3i>( srcImgs, imgToDenoiseIndex, temporalWindowSize, - dst, templateWindowSize, searchWindowSize, &h[0])); + dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); break; case CV_8UC4: if (hn == 1) parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker, IT, UIT, D, int>( srcImgs, imgToDenoiseIndex, temporalWindowSize, - dst, templateWindowSize, searchWindowSize, &h[0])); + dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); else parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker, IT, UIT, D, Vec4i>( srcImgs, imgToDenoiseIndex, temporalWindowSize, - dst, templateWindowSize, searchWindowSize, &h[0])); + dst, templateWindowSize, searchWindowSize, &h[0]), + granularity); break; default: CV_Error(Error::StsBadArg, diff --git a/modules/photo/test/test_denoising.cpp b/modules/photo/test/test_denoising.cpp index 9808e9cddc..c3a69a2f76 100644 --- a/modules/photo/test/test_denoising.cpp +++ b/modules/photo/test/test_denoising.cpp @@ -156,3 +156,14 @@ TEST(Photo_White, issue_2646) ASSERT_EQ(0, nonWhitePixelsCount); } + +TEST(Photo_Denoising, speed) +{ + string imgname = string(cvtest::TS::ptr()->get_data_path()) + "shared/5MP.png"; + Mat src = imread(imgname, 0), dst; + + double t = (double)getTickCount(); + fastNlMeansDenoising(src, dst, 5, 7, 21); + t = (double)getTickCount() - t; + printf("execution time: %gms\n", t*1000./getTickFrequency()); +}