From 424bc609b6f352a2b10f2a01ffcd037483e3591f Mon Sep 17 00:00:00 2001 From: noob Date: Wed, 29 Aug 2012 17:44:21 +0200 Subject: [PATCH] Retina module is now parallelized thanks to the TBB library. Speed increase expected on multicore platforms --- modules/contrib/src/basicretinafilter.cpp | 120 ++++++---- modules/contrib/src/basicretinafilter.hpp | 277 +++++++++++++++++++--- modules/contrib/src/magnoretinafilter.cpp | 4 + modules/contrib/src/magnoretinafilter.hpp | 46 +++- modules/contrib/src/parvoretinafilter.cpp | 18 +- modules/contrib/src/parvoretinafilter.hpp | 39 +++ modules/contrib/src/retinacolor.cpp | 96 +++----- modules/contrib/src/retinacolor.hpp | 82 ++++++- modules/contrib/src/templatebuffer.hpp | 64 +++-- 9 files changed, 577 insertions(+), 169 deletions(-) diff --git a/modules/contrib/src/basicretinafilter.cpp b/modules/contrib/src/basicretinafilter.cpp index 06320f8117..a4270aa74d 100644 --- a/modules/contrib/src/basicretinafilter.cpp +++ b/modules/contrib/src/basicretinafilter.cpp @@ -316,28 +316,50 @@ void BasicRetinaFilter::runFilter_LocalAdapdation_autonomous(const std::valarray _spatiotemporalLPfilter(get_data(inputFrame), &_filterOutput[0]); _localLuminanceAdaptation(get_data(inputFrame), &_filterOutput[0], &outputFrame[0]); } -// local luminance adaptation of the input in regard of localLuminance buffer -void BasicRetinaFilter::_localLuminanceAdaptation(const float *inputFrame, const float *localLuminance, float *outputFrame) -{ - float meanLuminance=0; - const float *luminancePTR=inputFrame; - for (unsigned int i=0;i<_filterOutput.getNBpixels();++i) - meanLuminance+=*(luminancePTR++); - meanLuminance/=_filterOutput.getNBpixels(); - //float tempMeanValue=meanLuminance+_meanInputValue*_tau; - updateCompressionParameter(meanLuminance); +// local luminance adaptation of the input in regard of localLuminance buffer, the input is overwritten in place and becomes the output +void BasicRetinaFilter::_localLuminanceAdaptation(float *inputOutputFrame, const 
float *localLuminance) +{ + _localLuminanceAdaptation(inputOutputFrame, localLuminance, inputOutputFrame, false); + + /* const float *localLuminancePTR=localLuminance; + float *inputOutputFramePTR=inputOutputFrame; + + for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel, ++inputOutputFramePTR) + { + float X0=*(localLuminancePTR++)*_localLuminanceFactor+_localLuminanceAddon; + *(inputOutputFramePTR) = (_maxInputValue+X0)**inputOutputFramePTR/(*inputOutputFramePTR +X0+0.00000000001); + } + */ +} + +// local luminance adaptation of the input in regard of localLuminance buffer +void BasicRetinaFilter::_localLuminanceAdaptation(const float *inputFrame, const float *localLuminance, float *outputFrame, const bool updateLuminanceMean) +{ + if (updateLuminanceMean) + { float meanLuminance=0; + const float *luminancePTR=inputFrame; + for (unsigned int i=0;i<_filterOutput.getNBpixels();++i) + meanLuminance+=*(luminancePTR++); + meanLuminance/=_filterOutput.getNBpixels(); + //float tempMeanValue=meanLuminance+_meanInputValue*_tau; + updateCompressionParameter(meanLuminance); + } +#ifdef HAVE_TBB + tbb::parallel_for(tbb::blocked_range(0,_filterOutput.getNBpixels()), Parallel_localAdaptation(localLuminance, inputFrame, outputFrame, _localLuminanceFactor, _localLuminanceAddon, _maxInputValue), tbb::auto_partitioner()); +#else //std::cout<(IDrowStart,IDrowEnd), Parallel_horizontalCausalFilter_addInput(inputFrame, outputFrame, IDrowStart, _filterOutput.getNBcolumns(), _a, _tau), tbb::auto_partitioner()); +#else for (unsigned int IDrow=IDrowStart; IDrow(IDrowStart,IDrowEnd), Parallel_horizontalAnticausalFilter(outputFrame, IDrowEnd, _filterOutput.getNBcolumns(), _a ), tbb::auto_partitioner()); +#else for (unsigned int IDrow=IDrowStart; IDrow(IDcolumnStart,IDcolumnEnd), Parallel_verticalCausalFilter(outputFrame, _filterOutput.getNBrows(), _filterOutput.getNBcolumns(), _a ), tbb::auto_partitioner()); +#else + for (unsigned int IDcolumn=IDcolumnStart; 
IDcolumn(IDcolumnStart,IDcolumnEnd), Parallel_verticalAnticausalFilter_multGain(outputFrame, _filterOutput.getNBrows(), _filterOutput.getNBcolumns(), _a, _gain ), tbb::auto_partitioner()); +#else + float* offset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns(); //#pragma omp parallel for for (unsigned int IDcolumn=IDcolumnStart; IDcolumn(IDrowStart,IDrowEnd), Parallel_horizontalAnticausalFilter_Irregular(outputFrame, spatialConstantBuffer, IDrowEnd, _filterOutput.getNBcolumns()), tbb::auto_partitioner()); +#else register float* outputPTR=outputFrame+IDrowEnd*(_filterOutput.getNBcolumns())-1; - register const float* spatialConstantPTR=&_progressiveSpatialConstant[0]+IDrowEnd*(_filterOutput.getNBcolumns())-1; + register const float* spatialConstantPTR=spatialConstantBuffer+IDrowEnd*(_filterOutput.getNBcolumns())-1; for (unsigned int IDrow=IDrowStart; IDrow(IDcolumnStart,IDcolumnEnd), Parallel_verticalCausalFilter_Irregular(outputFrame, spatialConstantBuffer, _filterOutput.getNBrows(), _filterOutput.getNBcolumns()), tbb::auto_partitioner()); +#else for (unsigned int IDcolumn=IDcolumnStart; IDcolumn main idea paralellise main filters loops, then, only the most used methods are parallelized... 
TODO : increase the number of parallelised methods as necessary +** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised +** ==> functors constructors can differ from the parameters used with their related serial functions +*/ + +#define _DEBUG_TBB // define DEBUG_TBB in order to display additional data on stdout + class Parallel_horizontalAnticausalFilter + { + private: + float *outputFrame; + const unsigned int IDrowEnd, nbColumns; + const float filterParam_a; + public: + // constructor which takes the input image pointer reference and limits + Parallel_horizontalAnticausalFilter(float *bufferToProcess, const unsigned int idEnd, const unsigned int nbCols, const float a ) + :outputFrame(bufferToProcess), IDrowEnd(idEnd), nbColumns(nbCols), filterParam_a(a) + { +#ifdef DEBUG_TBB + std::cout<<"Parallel_horizontalAnticausalFilter::Parallel_horizontalAnticausalFilter :" + <<"\n\t idEnd="<& r ) const { + + for (size_t IDrow=r.begin(); IDrow!=r.end(); ++IDrow) + { + register float* outputPTR=outputFrame+(IDrowEnd-IDrow)*(nbColumns)-1; + register const float* spatialConstantPTR=spatialConstantBuffer+(IDrowEnd-IDrow)*(nbColumns)-1; + register float result=0; + for (unsigned int index=0; index& r ) const { + for (unsigned int IDcolumn=r.begin(); IDcolumn!=r.end(); ++IDcolumn) + { + register float result=0; + register float *outputPTR=outputFrame+IDcolumn; + register const float* spatialConstantPTR=spatialConstantBuffer+IDcolumn; + for (unsigned int index=0; index(0,_filterOutput.getNBpixels()), Parallel_amacrineCellsComputing(OPL_ON, OPL_OFF, &_previousInput_ON[0], &_previousInput_OFF[0], &_amacrinCellsTempOutput_ON[0], &_amacrinCellsTempOutput_OFF[0], _temporalCoefficient), tbb::auto_partitioner()); +#else register const float *OPL_ON_PTR=OPL_ON; register const float *OPL_OFF_PTR=OPL_OFF; register float *previousInput_ON_PTR= &_previousInput_ON[0]; @@ -175,6 +178,7 @@ void 
MagnoRetinaFilter::_amacrineCellsComputing(const float *OPL_ON, const float *(previousInput_OFF_PTR++)=*(OPL_OFF_PTR++); } +#endif } // launch filter that runs all the IPL filter diff --git a/modules/contrib/src/magnoretinafilter.hpp b/modules/contrib/src/magnoretinafilter.hpp index b5c9015ac6..daefb74485 100644 --- a/modules/contrib/src/magnoretinafilter.hpp +++ b/modules/contrib/src/magnoretinafilter.hpp @@ -190,10 +190,52 @@ private: // varialbles float _temporalCoefficient; - // amacrine cells filter : high pass temporal filter - void _amacrineCellsComputing(const float *ONinput, const float *OFFinput); + // amacrine cells filter : high pass temporal filter + void _amacrineCellsComputing(const float *ONinput, const float *OFFinput); +#ifdef HAVE_TBB +/****************************************************** +** IF TBB is useable, then, main loops are parallelized using these functors +** ==> main idea paralellise main filters loops, then, only the most used methods are parallelized... 
TODO : increase the number of parallelised methods as necessary +** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised +** ==> functors constructors can differ from the parameters used with their related serial functions +*/ + class Parallel_amacrineCellsComputing + { + private: + const float *OPL_ON, *OPL_OFF; + float *previousInput_ON, *previousInput_OFF, *amacrinCellsTempOutput_ON, *amacrinCellsTempOutput_OFF; + const float temporalCoefficient; + public: + Parallel_amacrineCellsComputing(const float *OPL_ON_PTR, const float *OPL_OFF_PTR, float *previousInput_ON_PTR, float *previousInput_OFF_PTR, float *amacrinCellsTempOutput_ON_PTR, float *amacrinCellsTempOutput_OFF_PTR, float temporalCoefficientVal) + :OPL_ON(OPL_ON_PTR), OPL_OFF(OPL_OFF_PTR), previousInput_ON(previousInput_ON_PTR), previousInput_OFF(previousInput_OFF_PTR), amacrinCellsTempOutput_ON(amacrinCellsTempOutput_ON_PTR), amacrinCellsTempOutput_OFF(amacrinCellsTempOutput_OFF_PTR), temporalCoefficient(temporalCoefficientVal) {} + + void operator()( const tbb::blocked_range& r ) const { + register const float *OPL_ON_PTR=OPL_ON+r.begin(); + register const float *OPL_OFF_PTR=OPL_OFF+r.begin(); + register float *previousInput_ON_PTR= previousInput_ON+r.begin(); + register float *previousInput_OFF_PTR= previousInput_OFF+r.begin(); + register float *amacrinCellsTempOutput_ON_PTR= amacrinCellsTempOutput_ON+r.begin(); + register float *amacrinCellsTempOutput_OFF_PTR= amacrinCellsTempOutput_OFF+r.begin(); + for (unsigned int IDpixel=r.begin() ; IDpixel!=r.end(); ++IDpixel) + { + /* Compute ON and OFF amacrin cells high pass temporal filter */ + float magnoXonPixelResult = temporalCoefficient*(*amacrinCellsTempOutput_ON_PTR+ *OPL_ON_PTR-*previousInput_ON_PTR); + *(amacrinCellsTempOutput_ON_PTR++)=((float)(magnoXonPixelResult>0))*magnoXonPixelResult; + + float magnoXoffPixelResult = temporalCoefficient*(*amacrinCellsTempOutput_OFF_PTR+ 
*OPL_OFF_PTR-*previousInput_OFF_PTR); + *(amacrinCellsTempOutput_OFF_PTR++)=((float)(magnoXoffPixelResult>0))*magnoXoffPixelResult; + + /* prepare next loop */ + *(previousInput_ON_PTR++)=*(OPL_ON_PTR++); + *(previousInput_OFF_PTR++)=*(OPL_OFF_PTR++); + + } + } + + }; +#endif }; } diff --git a/modules/contrib/src/parvoretinafilter.cpp b/modules/contrib/src/parvoretinafilter.cpp index a6cbf1b801..50d1b0ba68 100644 --- a/modules/contrib/src/parvoretinafilter.cpp +++ b/modules/contrib/src/parvoretinafilter.cpp @@ -199,17 +199,20 @@ const std::valarray &ParvoRetinaFilter::runFilter(const std::valarray(0,_filterOutput.getNBpixels()), Parallel_OPL_OnOffWaysComputing(&_photoreceptorsOutput[0], &_horizontalCellsOutput[0], &_bipolarCellsOutputON[0], &_bipolarCellsOutputOFF[0], &_parvocellularOutputON[0], &_parvocellularOutputOFF[0]), tbb::auto_partitioner()); +#else + float *photoreceptorsOutput_PTR= &_photoreceptorsOutput[0]; + float *horizontalCellsOutput_PTR= &_horizontalCellsOutput[0]; + float *bipolarCellsON_PTR = &_bipolarCellsOutputON[0]; + float *bipolarCellsOFF_PTR = &_bipolarCellsOutputOFF[0]; + float *parvocellularOutputON_PTR= &_parvocellularOutputON[0]; + float *parvocellularOutputOFF_PTR= &_parvocellularOutputOFF[0]; // compute bipolar cells response equal to photoreceptors minus horizontal cells response // and copy the result on parvo cellular outputs... 
keeping time before their local contrast adaptation for final result for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel) @@ -222,6 +225,7 @@ void ParvoRetinaFilter::_OPL_OnOffWaysComputing() *(parvocellularOutputON_PTR++)=*(bipolarCellsON_PTR++) = isPositive*pixelDifference; *(parvocellularOutputOFF_PTR++)=*(bipolarCellsOFF_PTR++)= (isPositive-1.0f)*pixelDifference; } +#endif } } diff --git a/modules/contrib/src/parvoretinafilter.hpp b/modules/contrib/src/parvoretinafilter.hpp index 54521b17b8..76f5506855 100644 --- a/modules/contrib/src/parvoretinafilter.hpp +++ b/modules/contrib/src/parvoretinafilter.hpp @@ -216,6 +216,45 @@ private: // private functions void _OPL_OnOffWaysComputing(); +#ifdef HAVE_TBB +/****************************************************** +** IF TBB is useable, then, main loops are parallelized using these functors +** ==> main idea paralellise main filters loops, then, only the most used methods are parallelized... TODO : increase the number of parallelised methods as necessary +** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised +** ==> functors constructors can differ from the parameters used with their related serial functions +*/ + class Parallel_OPL_OnOffWaysComputing + { + private: + float *photoreceptorsOutput, *horizontalCellsOutput, *bipolarCellsON, *bipolarCellsOFF, *parvocellularOutputON, *parvocellularOutputOFF; + public: + Parallel_OPL_OnOffWaysComputing(float *photoreceptorsOutput_PTR, float *horizontalCellsOutput_PTR, float *bipolarCellsON_PTR, float *bipolarCellsOFF_PTR, float *parvocellularOutputON_PTR, float *parvocellularOutputOFF_PTR) + :photoreceptorsOutput(photoreceptorsOutput_PTR), horizontalCellsOutput(horizontalCellsOutput_PTR), bipolarCellsON(bipolarCellsON_PTR), bipolarCellsOFF(bipolarCellsOFF_PTR), parvocellularOutputON(parvocellularOutputON_PTR), parvocellularOutputOFF(parvocellularOutputOFF_PTR) {} + + void operator()( const 
tbb::blocked_range& r ) const { + // compute bipolar cells response equal to photoreceptors minus horizontal cells response + // and copy the result on parvo cellular outputs... keeping time before their local contrast adaptation for final result + float *photoreceptorsOutput_PTR= photoreceptorsOutput+r.begin(); + float *horizontalCellsOutput_PTR= horizontalCellsOutput+r.begin(); + float *bipolarCellsON_PTR = bipolarCellsON+r.begin(); + float *bipolarCellsOFF_PTR = bipolarCellsOFF+r.begin(); + float *parvocellularOutputON_PTR= parvocellularOutputON+r.begin(); + float *parvocellularOutputOFF_PTR= parvocellularOutputOFF+r.begin(); + + for (register unsigned int IDpixel=r.begin() ; IDpixel!=r.end() ; ++IDpixel) + { + float pixelDifference = *(photoreceptorsOutput_PTR++) -*(horizontalCellsOutput_PTR++); + // test condition to allow write pixelDifference in ON or OFF buffer and 0 in the over + float isPositive=(float) (pixelDifference>0.0f); + + // ON and OFF channels writing step + *(parvocellularOutputON_PTR++)=*(bipolarCellsON_PTR++) = isPositive*pixelDifference; + *(parvocellularOutputOFF_PTR++)=*(bipolarCellsOFF_PTR++)= (isPositive-1.0f)*pixelDifference; + } + } + }; +#endif + }; } #endif diff --git a/modules/contrib/src/retinacolor.cpp b/modules/contrib/src/retinacolor.cpp index f98b0507dc..b452956997 100644 --- a/modules/contrib/src/retinacolor.cpp +++ b/modules/contrib/src/retinacolor.cpp @@ -89,7 +89,7 @@ RetinaColor::RetinaColor(const unsigned int NBrows, const unsigned int NBcolumns _demultiplexedColorFrame(NBrows*NBcolumns*3), _chrominance(NBrows*NBcolumns*3), _colorLocalDensity(NBrows*NBcolumns*3), - _imageGradient(NBrows*NBcolumns*3) + _imageGradient(NBrows*NBcolumns*2) { // link to parent buffers (let's recycle !) 
_luminance=&_filterOutput; @@ -126,12 +126,12 @@ RetinaColor::~RetinaColor() void RetinaColor::clearAllBuffers() { BasicRetinaFilter::clearAllBuffers(); - _tempMultiplexedFrame=0; - _demultiplexedTempBuffer=0; + _tempMultiplexedFrame=0.f; + _demultiplexedTempBuffer=0.f; - _demultiplexedColorFrame=0; - _chrominance=0; - _imageGradient=1; + _demultiplexedColorFrame=0.f; + _chrominance=0.f; + _imageGradient=0.57f; } /** @@ -149,7 +149,7 @@ void RetinaColor::resize(const unsigned int NBrows, const unsigned int NBcolumns _demultiplexedColorFrame.resize(NBrows*NBcolumns*3); _chrominance.resize(NBrows*NBcolumns*3); _colorLocalDensity.resize(NBrows*NBcolumns*3); - _imageGradient.resize(NBrows*NBcolumns*3); + _imageGradient.resize(NBrows*NBcolumns*2); // link to parent buffers (let's recycle !) _luminance=&_filterOutput; @@ -325,15 +325,15 @@ void RetinaColor::runColorDemultiplexing(const std::valarray &multiplexed }else { - register const float *multiplexedColorFramePTR1= get_data(multiplexedColorFrame); - for (unsigned int indexc=0; indexc<_filterOutput.getNBpixels() ; ++indexc, ++chrominancePTR, ++colorLocalDensityPTR, ++luminance, ++multiplexedColorFramePTR1) + register const float *multiplexedColorFramePTR= get_data(multiplexedColorFrame); + for (unsigned int indexc=0; indexc<_filterOutput.getNBpixels() ; ++indexc, ++chrominancePTR, ++colorLocalDensityPTR, ++luminance, ++multiplexedColorFramePTR) { // normalize by photoreceptors density float Cr=*(chrominancePTR)*_colorLocalDensity[indexc]; float Cg=*(chrominancePTR+_filterOutput.getNBpixels())*_colorLocalDensity[indexc+_filterOutput.getNBpixels()]; float Cb=*(chrominancePTR+_filterOutput.getDoubleNBpixels())*_colorLocalDensity[indexc+_filterOutput.getDoubleNBpixels()]; *luminance=(Cr+Cg+Cb)*_pG; - _demultiplexedTempBuffer[_colorSampling[indexc]] = *multiplexedColorFramePTR1 - *luminance; + _demultiplexedTempBuffer[_colorSampling[indexc]] = *multiplexedColorFramePTR - *luminance; } @@ -349,8 +349,9 @@ void 
RetinaColor::runColorDemultiplexing(const std::valarray &multiplexed _adaptiveSpatialLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getNBpixels(), &_demultiplexedColorFrame[0]+_filterOutput.getNBpixels()); _adaptiveSpatialLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getDoubleNBpixels(), &_demultiplexedColorFrame[0]+_filterOutput.getDoubleNBpixels()); - for (unsigned int index=0; index<_filterOutput.getNBpixels()*3 ; ++index) // cette boucle pourrait �tre supprimee en passant la densit� � la fonction de filtrage - _demultiplexedColorFrame[index] /= _chrominance[index]; +/* for (unsigned int index=0; index<_filterOutput.getNBpixels()*3 ; ++index) // cette boucle pourrait �tre supprimee en passant la densit� � la fonction de filtrage + _demultiplexedColorFrame[index] /= _chrominance[index];*/ + _demultiplexedColorFrame/=_chrominance; // more optimal ;o) // compute and substract the residual luminance for (unsigned int index=0; index<_filterOutput.getNBpixels() ; ++index) @@ -432,6 +433,9 @@ void RetinaColor::clipRGBOutput_0_maxInputValue(float *inputOutputBuffer, const if (inputOutputBuffer==NULL) inputOutputBuffer= &_demultiplexedColorFrame[0]; +#ifdef HAVE_TBB // call the TemplateBuffer TBB clipping method + tbb::parallel_for(tbb::blocked_range(0,_filterOutput.getNBpixels()*3), Parallel_clipBufferValues(inputOutputBuffer, 0, maxInputValue), tbb::auto_partitioner()); +#else register float *inputOutputBufferPTR=inputOutputBuffer; for (register unsigned int jf = 0; jf < _filterOutput.getNBpixels()*3; ++jf, ++inputOutputBufferPTR) { @@ -440,6 +444,7 @@ void RetinaColor::clipRGBOutput_0_maxInputValue(float *inputOutputBuffer, const else if (*inputOutputBufferPTR<0) *inputOutputBufferPTR=0; } +#endif //std::cout<<"RetinaColor::...normalizing RGB frame OK"< horizontal filters work with the first layer of imageGradient _adaptiveHorizontalCausalFilter_addInput(inputFrame, outputFrame, 0, _filterOutput.getNBrows()); - 
_adaptiveHorizontalAnticausalFilter(outputFrame, 0, _filterOutput.getNBrows()); - _adaptiveVerticalCausalFilter(outputFrame, 0, _filterOutput.getNBcolumns()); + _horizontalAnticausalFilter_Irregular(outputFrame, 0, _filterOutput.getNBrows(), &_imageGradient[0]); + // -> vertical filters work with the second layer of imageGradient + _verticalCausalFilter_Irregular(outputFrame, 0, _filterOutput.getNBcolumns(), &_imageGradient[0]+_filterOutput.getNBpixels()); _adaptiveVerticalAnticausalFilter_multGain(outputFrame, 0, _filterOutput.getNBcolumns()); - } -// horizontal causal filter which adds the input inside +// horizontal causal filter which adds the input inside... replaces the parent _horizontalCausalFilter_Irregular_addInput by avoiding a product for each pixel void RetinaColor::_adaptiveHorizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd) { +#ifdef HAVE_TBB + tbb::parallel_for(tbb::blocked_range(IDrowStart,IDrowEnd), Parallel_adaptiveHorizontalCausalFilter_addInput(inputFrame, outputFrame, &_imageGradient[0], _filterOutput.getNBcolumns()), tbb::auto_partitioner()); +#else register float* outputPTR=outputFrame+IDrowStart*_filterOutput.getNBcolumns(); register const float* inputPTR=inputFrame+IDrowStart*_filterOutput.getNBcolumns(); - register float *imageGradientPTR= &_imageGradient[0]+IDrowStart*_filterOutput.getNBcolumns(); + register const float *imageGradientPTR= &_imageGradient[0]+IDrowStart*_filterOutput.getNBcolumns(); for (unsigned int IDrow=IDrowStart; IDrow(IDcolumnStart,IDcolumnEnd), Parallel_adaptiveVerticalAnticausalFilter_multGain(outputFrame, &_imageGradient[0]+_filterOutput.getNBpixels(), _filterOutput.getNBrows(), _filterOutput.getNBcolumns(), _gain), tbb::auto_partitioner()); +#else float* outputOffset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns(); - float* gradOffset= &_imageGradient[0]+_filterOutput.getNBpixels()*2-_filterOutput.getNBcolumns(); + 
float* gradOffset= &_imageGradient[0]+_filterOutput.getNBpixels()*2-_filterOutput.getNBcolumns(); for (unsigned int IDcolumn=IDcolumnStart; IDcolumn special adaptive filters dedicated to low pass filtering on the chrominance (skeeps filtering on the edges) void _adaptiveSpatialLPfilter(const float *inputFrame, float *outputFrame); - void _adaptiveHorizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, const unsigned int IDrowStart, const unsigned int IDrowEnd); - void _adaptiveHorizontalAnticausalFilter(float *outputFrame, const unsigned int IDrowStart, const unsigned int IDrowEnd); - void _adaptiveVerticalCausalFilter(float *outputFrame, const unsigned int IDcolumnStart, const unsigned int IDcolumnEnd); + void _adaptiveHorizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, const unsigned int IDrowStart, const unsigned int IDrowEnd); // TBB parallelized void _adaptiveVerticalAnticausalFilter_multGain(float *outputFrame, const unsigned int IDcolumnStart, const unsigned int IDcolumnEnd); void _computeGradient(const float *luminance); void _normalizeOutputs_0_maxOutputValue(void); @@ -258,6 +256,84 @@ protected: // color space transform void _applyImageColorSpaceConversion(const std::valarray &inputFrame, std::valarray &outputFrame, const float *transformTable); +#ifdef HAVE_TBB +/****************************************************** +** IF TBB is useable, then, main loops are parallelized using these functors +** ==> main idea paralellise main filters loops, then, only the most used methods are parallelized... 
TODO : increase the number of parallelised methods as necessary +** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised +** ==> functors constructors can differ from the parameters used with their related serial functions +*/ + +/* Template : + class + { + private: + + public: + Parallel_() + : {} + + void operator()( const tbb::blocked_range& r ) const { + + } + }; +*/ + class Parallel_adaptiveHorizontalCausalFilter_addInput + { + private: + float *outputFrame; + const float *inputFrame, *imageGradient; + const unsigned int nbColumns; + public: + Parallel_adaptiveHorizontalCausalFilter_addInput(const float *inputImg, float *bufferToProcess, const float *imageGrad, const unsigned int nbCols) + :outputFrame(bufferToProcess), inputFrame(inputImg), imageGradient(imageGrad), nbColumns(nbCols) {}; + + void operator()( const tbb::blocked_range& r ) const { + register float* outputPTR=outputFrame+r.begin()*nbColumns; + register const float* inputPTR=inputFrame+r.begin()*nbColumns; + register const float *imageGradientPTR= imageGradient+r.begin()*nbColumns; + for (unsigned int IDrow=r.begin(); IDrow!=r.end(); ++IDrow) + { + register float result=0; + for (unsigned int index=0; index& r ) const { + float* offset=outputFrame+nbColumns*nbRows-nbColumns; + const float* gradOffset= imageGradient+nbColumns*nbRows-nbColumns; + for (unsigned int IDcolumn=r.begin(); IDcolumn!=r.end(); ++IDcolumn) + { + register float result=0; + register float *outputPTR=offset+IDcolumn; + register const float *imageGradientPTR=gradOffset+IDcolumn; + for (unsigned int index=0; index #include + +//// If TBB is used +// ==> then include required includes +#ifdef HAVE_TBB +#include "tbb/parallel_for.h" +#include "tbb/blocked_range.h" + +// ==> declare useful generic tools +template +class Parallel_clipBufferValues +{ +private: + type *bufferToClip; + const type minValue, maxValue; + +public: + Parallel_clipBufferValues(type* bufferToProcess, const type min, 
const type max) + : bufferToClip(bufferToProcess), minValue(min), maxValue(max){} + + void operator()( const tbb::blocked_range& r ) const { + register type *inputOutputBufferPTR=bufferToClip+r.begin(); + for (register unsigned int jf = r.begin(); jf != r.end(); ++jf, ++inputOutputBufferPTR) + { + if (*inputOutputBufferPTR>maxValue) + *inputOutputBufferPTR=maxValue; + else if (*inputOutputBufferPTRmax()"<max()<<"maxThreshold="<min()"<min()<<"minThreshold="<Buffer(); - for (unsigned int i=0;isize();++i, ++bufferPTR) - { - if (*bufferPTRupdatedHighValue) - *bufferPTR=updatedHighValue; - } + std::cout<<"Tdebug"<max()"<max()<<"maxThreshold="<min()"<min()<<"minThreshold="<Buffer(); +#ifdef HAVE_TBB // call the TemplateBuffer TBB clipping method + tbb::parallel_for(tbb::blocked_range(0,this->size()), Parallel_clipBufferValues(bufferPTR, updatedLowValue, updatedHighValue), tbb::auto_partitioner()); +#else - normalizeGrayOutput_0_maxOutputValue(this->Buffer(), this->size(), maxOutputValue); + for (unsigned int i=0;isize();++i, ++bufferPTR) + { + if (*bufferPTRupdatedHighValue) + *bufferPTR=updatedHighValue; + } +#endif + normalizeGrayOutput_0_maxOutputValue(this->Buffer(), this->size(), maxOutputValue); }