diff --git a/modules/imgproc/src/color.simd_helpers.hpp b/modules/imgproc/src/color.simd_helpers.hpp new file mode 100644 index 0000000000..70e7844277 --- /dev/null +++ b/modules/imgproc/src/color.simd_helpers.hpp @@ -0,0 +1,673 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "opencv2/imgproc.hpp" +#include "opencv2/core/utility.hpp" +#include +#include "opencl_kernels_imgproc.hpp" +#include "hal_replacement.hpp" +#include "opencv2/core/hal/intrin.hpp" +#include "opencv2/core/softfloat.hpp" + +#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) + +namespace cv +{ + +//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601 +const float B2YF = 0.114f; +const float G2YF = 0.587f; +const float R2YF = 0.299f; + +enum +{ + yuv_shift = 14, + xyz_shift = 12, + R2Y = 4899, // == R2YF*16384 + G2Y = 9617, // == G2YF*16384 + B2Y = 1868, // == B2YF*16384 + BLOCK_SIZE = 256 +}; + +template struct ColorChannel +{ + typedef float worktype_f; + static _Tp max() { return std::numeric_limits<_Tp>::max(); } + static _Tp half() { return (_Tp)(max()/2 + 1); } +}; + +template<> struct ColorChannel +{ + typedef float worktype_f; + static float max() { return 1.f; } + static float half() { return 0.5f; } +}; + +/*template<> struct ColorChannel +{ + typedef double worktype_f; + static double max() { return 1.; } + static double half() { return 0.5; } +};*/ + +// +// Helper functions +// + +namespace { + +inline bool isHSV(int code) +{ + switch(code) + { + case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: + case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL: + return true; + default: + return false; + } +} + +inline bool isLab(int code) +{ + switch (code) + { + case COLOR_Lab2BGR: case COLOR_Lab2RGB: case COLOR_Lab2LBGR: case 
COLOR_Lab2LRGB: + case COLOR_BGR2Lab: case COLOR_RGB2Lab: case COLOR_LBGR2Lab: case COLOR_LRGB2Lab: + return true; + default: + return false; + } +} + +inline bool is_sRGB(int code) +{ + switch (code) + { + case COLOR_BGR2Lab: case COLOR_RGB2Lab: case COLOR_BGR2Luv: case COLOR_RGB2Luv: + case COLOR_Lab2BGR: case COLOR_Lab2RGB: case COLOR_Luv2BGR: case COLOR_Luv2RGB: + return true; + default: + return false; + } +} + +inline bool swapBlue(int code) +{ + switch (code) + { + case COLOR_BGR2BGRA: case COLOR_BGRA2BGR: + case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: + case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: + case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY: + case COLOR_BGR2YCrCb: case COLOR_BGR2YUV: + case COLOR_YCrCb2BGR: case COLOR_YUV2BGR: + case COLOR_BGR2XYZ: case COLOR_XYZ2BGR: + case COLOR_BGR2HSV: case COLOR_BGR2HLS: case COLOR_BGR2HSV_FULL: case COLOR_BGR2HLS_FULL: + case COLOR_YUV2BGR_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2BGRA_IYUV: + case COLOR_YUV2BGR_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2BGRA_NV12: + case COLOR_Lab2BGR: case COLOR_Luv2BGR: case COLOR_Lab2LBGR: case COLOR_Luv2LBGR: + case COLOR_BGR2Lab: case COLOR_BGR2Luv: case COLOR_LBGR2Lab: case COLOR_LBGR2Luv: + case COLOR_HSV2BGR: case COLOR_HLS2BGR: case COLOR_HSV2BGR_FULL: case COLOR_HLS2BGR_FULL: + case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2BGR_YUY2: + case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU: + case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12: + return false; + default: + return true; + } +} + +inline bool isFullRangeHSV(int code) +{ + switch (code) + { + case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL: + case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: case COLOR_HLS2BGR_FULL: 
case COLOR_HLS2RGB_FULL: + return true; + default: + return false; + } +} + +inline int dstChannels(int code) +{ + switch( code ) + { + case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2RGBA: + case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA: + case COLOR_GRAY2BGRA: + case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12: + case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV: + case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU: + case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: + + return 4; + + case COLOR_BGRA2BGR: case COLOR_RGBA2BGR: case COLOR_RGB2BGR: + case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB: + case COLOR_GRAY2BGR: + case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: + case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: + case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: + case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: + + return 3; + + default: + return 0; + } +} + +inline int greenBits(int code) +{ + switch( code ) + { + case COLOR_BGR2BGR565: case COLOR_RGB2BGR565: case COLOR_BGRA2BGR565: case COLOR_RGBA2BGR565: + case COLOR_BGR5652BGR: case COLOR_BGR5652RGB: case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA: + case COLOR_BGR5652GRAY: case COLOR_GRAY2BGR565: + + return 6; + + case COLOR_BGR2BGR555: case COLOR_RGB2BGR555: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR555: + case COLOR_BGR5552BGR: case COLOR_BGR5552RGB: case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA: + case COLOR_BGR5552GRAY: case COLOR_GRAY2BGR555: + + return 5; + + default: + return 0; + } +} + +inline int uIndex(int code) +{ + switch( code ) + { + case COLOR_RGB2YUV_YV12: case 
COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12: + + return 2; + + case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU: + case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: + case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: + case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: + + return 1; + + case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12: + case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV: + case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: + case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: + + return 0; + + default: + return -1; + } +} + +} // namespace:: + +template +struct Set +{ + static bool contains(int i) + { + return (i == i0 || i == i1 || i == i2); + } +}; + +template +struct Set +{ + static bool contains(int i) + { + return (i == i0 || i == i1); + } +}; + +template +struct Set +{ + static bool contains(int i) + { + return (i == i0); + } +}; + +enum SizePolicy +{ + TO_YUV, FROM_YUV, NONE +}; + +template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > +struct CvtHelper +{ + CvtHelper(InputArray _src, OutputArray _dst, int dcn) + { + CV_Assert(!_src.empty()); + + int stype = _src.type(); + scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype); + + CV_Check(scn, VScn::contains(scn), "Invalid number of channels in input image"); + CV_Check(dcn, VDcn::contains(dcn), "Invalid number of channels in output image"); + CV_CheckDepth(depth, VDepth::contains(depth), "Unsupported depth of input image"); + + if (_src.getObj() == _dst.getObj()) // inplace processing (#6653) + _src.copyTo(src); + else + src = 
_src.getMat(); + Size sz = src.size(); + switch (sizePolicy) + { + case TO_YUV: + CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0); + dstSz = Size(sz.width, sz.height / 2 * 3); + break; + case FROM_YUV: + CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0); + dstSz = Size(sz.width, sz.height * 2 / 3); + break; + case NONE: + default: + dstSz = sz; + break; + } + _dst.create(dstSz, CV_MAKETYPE(depth, dcn)); + dst = _dst.getMat(); + } + Mat src, dst; + int depth, scn; + Size dstSz; +}; + +#ifdef HAVE_OPENCL + +template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > +struct OclHelper +{ + OclHelper( InputArray _src, OutputArray _dst, int dcn) : + nArgs(0) + { + src = _src.getUMat(); + Size sz = src.size(), dstSz; + int scn = src.channels(); + int depth = src.depth(); + + CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) ); + switch (sizePolicy) + { + case TO_YUV: + CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 ); + dstSz = Size(sz.width, sz.height / 2 * 3); + break; + case FROM_YUV: + CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 ); + dstSz = Size(sz.width, sz.height * 2 / 3); + break; + case NONE: + default: + dstSz = sz; + break; + } + + _dst.create(dstSz, CV_MAKETYPE(depth, dcn)); + dst = _dst.getUMat(); + } + + bool createKernel(cv::String name, ocl::ProgramSource& source, cv::String options) + { + ocl::Device dev = ocl::Device::getDefault(); + int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 
4 : 1; + int pxPerWIx = 1; + + cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ", + src.depth(), src.channels(), pxPerWIy); + + switch (sizePolicy) + { + case TO_YUV: + if (dev.isIntel() && + src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 && + dst.step % 4 == 0 && dst.offset % 4 == 0) + { + pxPerWIx = 2; + } + globalSize[0] = (size_t)dst.cols/(2*pxPerWIx); + globalSize[1] = ((size_t)dst.rows/3 + pxPerWIy - 1) / pxPerWIy; + baseOptions += format("-D PIX_PER_WI_X=%d ", pxPerWIx); + break; + case FROM_YUV: + globalSize[0] = (size_t)dst.cols/2; + globalSize[1] = ((size_t)dst.rows/2 + pxPerWIy - 1) / pxPerWIy; + break; + case NONE: + default: + globalSize[0] = (size_t)src.cols; + globalSize[1] = ((size_t)src.rows + pxPerWIy - 1) / pxPerWIy; + break; + } + + k.create(name.c_str(), source, baseOptions + options); + + if(k.empty()) + return false; + + nArgs = k.set(0, ocl::KernelArg::ReadOnlyNoSize(src)); + nArgs = k.set(nArgs, ocl::KernelArg::WriteOnly(dst)); + return true; + } + + bool run() + { + return k.run(2, globalSize, NULL, false); + } + + template + void setArg(const T& arg) + { + nArgs = k.set(nArgs, arg); + } + + UMat src, dst; + ocl::Kernel k; + size_t globalSize[2]; + int nArgs; +}; + +#endif + +///////////////////////////// Top-level template function //////////////////////////////// + +template +class CvtColorLoop_Invoker : public ParallelLoopBody +{ + typedef typename Cvt::channel_type _Tp; +public: + + CvtColorLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt) : + ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), + width(width_), cvt(_cvt) + { + } + + virtual void operator()(const Range& range) const CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + + const uchar* yS = src_data + static_cast(range.start) * src_step; + uchar* yD = dst_data + static_cast(range.start) * dst_step; + + for( int i = 
range.start; i < range.end; ++i, yS += src_step, yD += dst_step ) + cvt(reinterpret_cast(yS), reinterpret_cast<_Tp*>(yD), width); + } + +private: + const uchar * src_data; + const size_t src_step; + uchar * dst_data; + const size_t dst_step; + const int width; + const Cvt& cvt; + + const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&); +}; + +template +void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) +{ + parallel_for_(Range(0, height), + CvtColorLoop_Invoker(src_data, src_step, dst_data, dst_step, width, cvt), + (width * height) / static_cast(1<<16)); +} + +#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) +# define NEED_IPP 1 +#else +# define NEED_IPP 0 +#endif + +#if NEED_IPP + +#define MAX_IPP8u 255 +#define MAX_IPP16u 65535 +#define MAX_IPP32f 1.0 + +typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *); +typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize); +typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *); + +template +class CvtColorIPPLoop_Invoker : + public ParallelLoopBody +{ +public: + + CvtColorIPPLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt, bool *_ok) : + ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), cvt(_cvt), ok(_ok) + { + *ok = true; + } + + virtual void operator()(const Range& range) const CV_OVERRIDE + { + const void *yS = src_data + src_step * range.start; + void *yD = dst_data + dst_step * range.start; + if( !cvt(yS, static_cast(src_step), yD, static_cast(dst_step), width, range.end - range.start) ) + *ok = false; + else + { + CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); + } + } + +private: + const uchar * src_data; + const size_t src_step; + uchar * dst_data; + 
const size_t dst_step; + const int width; + const Cvt& cvt; + bool *ok; + + const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&); +}; + + +template +bool CvtColorIPPLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) +{ + bool ok; + parallel_for_(Range(0, height), CvtColorIPPLoop_Invoker(src_data, src_step, dst_data, dst_step, width, cvt, &ok), (width * height)/(double)(1<<16) ); + return ok; +} + + +template +bool CvtColorIPPLoopCopy(const uchar * src_data, size_t src_step, int src_type, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) +{ + Mat temp; + Mat src(Size(width, height), src_type, const_cast(src_data), src_step); + Mat source = src; + if( src_data == dst_data ) + { + src.copyTo(temp); + source = temp; + } + bool ok; + parallel_for_(Range(0, source.rows), + CvtColorIPPLoop_Invoker(source.data, source.step, dst_data, dst_step, + source.cols, cvt, &ok), + source.total()/(double)(1<<16) ); + return ok; +} + + +struct IPPGeneralFunctor +{ + IPPGeneralFunctor(ippiGeneralFunc _func) : ippiColorConvertGeneral(_func){} + bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const + { + return ippiColorConvertGeneral ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false; + } +private: + ippiGeneralFunc ippiColorConvertGeneral; +}; + + +struct IPPReorderFunctor +{ + IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : ippiColorConvertReorder(_func) + { + order[0] = _order0; + order[1] = _order1; + order[2] = _order2; + order[3] = 3; + } + bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const + { + return ippiColorConvertReorder ? 
CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false; + } +private: + ippiReorderFunc ippiColorConvertReorder; + int order[4]; +}; + + +struct IPPReorderGeneralFunctor +{ + IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) : + ippiColorConvertReorder(_func1), ippiColorConvertGeneral(_func2), depth(_depth) + { + order[0] = _order0; + order[1] = _order1; + order[2] = _order2; + order[3] = 3; + } + bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const + { + if (ippiColorConvertReorder == 0 || ippiColorConvertGeneral == 0) + return false; + + Mat temp; + temp.create(rows, cols, CV_MAKETYPE(depth, 3)); + if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0) + return false; + return CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0; + } +private: + ippiReorderFunc ippiColorConvertReorder; + ippiGeneralFunc ippiColorConvertGeneral; + int order[4]; + int depth; +}; + + +struct IPPGeneralReorderFunctor +{ + IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) : + ippiColorConvertGeneral(_func1), ippiColorConvertReorder(_func2), depth(_depth) + { + order[0] = _order0; + order[1] = _order1; + order[2] = _order2; + order[3] = 3; + } + bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const + { + if (ippiColorConvertGeneral == 0 || ippiColorConvertReorder == 0) + return false; + + Mat temp; + temp.create(rows, cols, CV_MAKETYPE(depth, 3)); + if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0) + return false; + return CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, temp.ptr(), 
(int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0; + } +private: + ippiGeneralFunc ippiColorConvertGeneral; + ippiReorderFunc ippiColorConvertReorder; + int order[4]; + int depth; +}; + +extern ippiReorderFunc ippiSwapChannelsC3C4RTab[8]; +extern ippiReorderFunc ippiSwapChannelsC4C3RTab[8]; +extern ippiReorderFunc ippiSwapChannelsC3RTab[8]; + +#endif + +#ifdef HAVE_OPENCL + +bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb ); +bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb ); +bool oclCvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb); +bool oclCvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb); +bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx ); +bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ); + +bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ); +bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ); +bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full ); +bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full ); + +bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse ); +bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits ); +bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits ); +bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits ); +bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits ); +bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx ); +bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn ); +bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst ); +bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst ); + +bool oclCvtColorBGR2YCrCb( InputArray _src, 
OutputArray _dst, int bidx); +bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx); +bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx ); +bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ); + +bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ); +bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ); +bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ); +bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx ); +bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ); + +#endif + +void cvtColorBGR2Lab( InputArray _src, OutputArray _dst, bool swapb, bool srgb); +void cvtColorBGR2Luv( InputArray _src, OutputArray _dst, bool swapb, bool srgb); +void cvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb ); +void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb ); +void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb ); +void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb ); + +void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb); +void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb); + +void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn); +void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ); +void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx ); +void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ); +void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx); +void cvtColorYUV2Gray_420( InputArray _src, 
OutputArray _dst ); +void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi ); + +void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ); +void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ); +void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange); +void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange); + +void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb); +void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits); +void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits); +void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb); +void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn); +void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits); +void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits); +void cvtColorRGBA2mRGBA(InputArray _src, OutputArray _dst); +void cvtColormRGBA2RGBA(InputArray _src, OutputArray _dst); + +} //namespace cv diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp new file mode 100644 index 0000000000..f0a4c87558 --- /dev/null +++ b/modules/imgproc/src/color_hsv.simd.hpp @@ -0,0 +1,1565 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "precomp.hpp" +#include "color.hpp" + +namespace cv +{ + +////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// + + +struct RGB2HSV_b +{ + typedef uchar channel_type; + + RGB2HSV_b(int _srccn, int _blueIdx, int _hrange) + : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) + { + CV_Assert( hrange == 180 || hrange == 256 ); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int i, bidx = blueIdx, scn = srccn; + const int hsv_shift = 12; + + static int sdiv_table[256]; + static int hdiv_table180[256]; + static int hdiv_table256[256]; + static volatile bool initialized = false; + + int hr = hrange; + const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256; + n *= 3; + + if( !initialized ) + { + sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0; + for( i = 1; i < 256; i++ ) + { + sdiv_table[i] = saturate_cast((255 << hsv_shift)/(1.*i)); + hdiv_table180[i] = saturate_cast((180 << hsv_shift)/(6.*i)); + hdiv_table256[i] = saturate_cast((256 << hsv_shift)/(6.*i)); + } + initialized = true; + } + + for( i = 0; i < n; i += 3, src += scn ) + { + int b = src[bidx], g = src[1], r = src[bidx^2]; + int h, s, v = b; + int vmin = b; + int vr, vg; + + CV_CALC_MAX_8U( v, g ); + CV_CALC_MAX_8U( v, r ); + CV_CALC_MIN_8U( vmin, g ); + CV_CALC_MIN_8U( vmin, r ); + + uchar diff = saturate_cast(v - vmin); + vr = v == r ? -1 : 0; + vg = v == g ? -1 : 0; + + s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; + h = (vr & (g - b)) + + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); + h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; + h += h < 0 ? 
hr : 0; + + dst[i] = saturate_cast(h); + dst[i+1] = (uchar)s; + dst[i+2] = (uchar)v; + } + } + + int srccn, blueIdx, hrange; +}; + + +struct RGB2HSV_f +{ + typedef float channel_type; + + RGB2HSV_f(int _srccn, int _blueIdx, float _hrange) + : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) { + #if CV_SIMD128 + hasSIMD = hasSIMD128(); + #endif + } + + #if CV_SIMD128 + inline void process(v_float32x4& v_r, v_float32x4& v_g, + v_float32x4& v_b, float hscale) const + { + v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b); + v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b); + + v_float32x4 v_eps = v_setall_f32(FLT_EPSILON); + v_float32x4 v_diff = v_max_rgb - v_min_rgb; + v_float32x4 v_s = v_diff / (v_abs(v_max_rgb) + v_eps); + + v_float32x4 v_r_eq_max = v_r == v_max_rgb; + v_float32x4 v_g_eq_max = v_g == v_max_rgb; + v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b, + v_select(v_g_eq_max, v_b - v_r, v_r - v_g)); + v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f), + v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f))); + v_float32x4 v_rev_diff = v_setall_f32(60.0f) / (v_diff + v_eps); + v_r = v_muladd(v_h, v_rev_diff, v_res) * v_setall_f32(hscale); + + v_g = v_s; + v_b = v_max_rgb; + } + #endif + + void operator()(const float* src, float* dst, int n) const + { + int i = 0, bidx = blueIdx, scn = srccn; + float hscale = hrange*(1.f/360.f); + n *= 3; + + #if CV_SIMD128 + if (hasSIMD) + { + if (scn == 3) { + if (bidx) { + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + v_float32x4 v_r; + v_float32x4 v_g; + v_float32x4 v_b; + v_load_deinterleave(src, v_r, v_g, v_b); + process(v_r, v_g, v_b, hscale); + v_store_interleave(dst + i, v_r, v_g, v_b); + } + } else { + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + v_float32x4 v_r; + v_float32x4 v_g; + v_float32x4 v_b; + v_load_deinterleave(src, v_r, v_g, v_b); + process(v_b, v_g, v_r, hscale); + v_store_interleave(dst + i, v_b, v_g, v_r); + } + } + } else { // scn == 4 + if 
(bidx) { + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + v_float32x4 v_r; + v_float32x4 v_g; + v_float32x4 v_b; + v_float32x4 v_a; + v_load_deinterleave(src, v_r, v_g, v_b, v_a); + process(v_r, v_g, v_b, hscale); + v_store_interleave(dst + i, v_r, v_g, v_b); + } + } else { + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + v_float32x4 v_r; + v_float32x4 v_g; + v_float32x4 v_b; + v_float32x4 v_a; + v_load_deinterleave(src, v_r, v_g, v_b, v_a); + process(v_b, v_g, v_r, hscale); + v_store_interleave(dst + i, v_b, v_g, v_r); + } + } + } + } + #endif + + for( ; i < n; i += 3, src += scn ) + { + float b = src[bidx], g = src[1], r = src[bidx^2]; + float h, s, v; + + float vmin, diff; + + v = vmin = r; + if( v < g ) v = g; + if( v < b ) v = b; + if( vmin > g ) vmin = g; + if( vmin > b ) vmin = b; + + diff = v - vmin; + s = diff/(float)(fabs(v) + FLT_EPSILON); + diff = (float)(60./(diff + FLT_EPSILON)); + if( v == r ) + h = (g - b)*diff; + else if( v == g ) + h = (b - r)*diff + 120.f; + else + h = (r - g)*diff + 240.f; + + if( h < 0 ) h += 360.f; + + dst[i] = h*hscale; + dst[i+1] = s; + dst[i+2] = v; + } + } + + int srccn, blueIdx; + float hrange; + #if CV_SIMD128 + bool hasSIMD; + #endif +}; + + +#if CV_SIMD128 +inline void HSV2RGB_simd(v_float32x4& v_h, v_float32x4& v_s, v_float32x4& v_v, float hscale) +{ + v_h = v_h * v_setall_f32(hscale); + v_float32x4 v_pre_sector = v_cvt_f32(v_trunc(v_h)); + v_h = v_h - v_pre_sector; + v_float32x4 v_tab0 = v_v; + v_float32x4 v_one = v_setall_f32(1.0f); + v_float32x4 v_tab1 = v_v * (v_one - v_s); + v_float32x4 v_tab2 = v_v * (v_one - (v_s * v_h)); + v_float32x4 v_tab3 = v_v * (v_one - (v_s * (v_one - v_h))); + + v_float32x4 v_one_sixth = v_setall_f32(1.0f / 6.0f); + v_float32x4 v_sector = v_pre_sector * v_one_sixth; + v_sector = v_cvt_f32(v_trunc(v_sector)); + v_float32x4 v_six = v_setall_f32(6.0f); + v_sector = v_pre_sector - (v_sector * v_six); + + v_float32x4 v_two = v_setall_f32(2.0f); + v_h = v_tab1 & (v_sector < v_two); 
+ v_h = v_h | (v_tab3 & (v_sector == v_two)); + v_float32x4 v_three = v_setall_f32(3.0f); + v_h = v_h | (v_tab0 & (v_sector == v_three)); + v_float32x4 v_four = v_setall_f32(4.0f); + v_h = v_h | (v_tab0 & (v_sector == v_four)); + v_h = v_h | (v_tab2 & (v_sector > v_four)); + + v_s = v_tab3 & (v_sector < v_one); + v_s = v_s | (v_tab0 & (v_sector == v_one)); + v_s = v_s | (v_tab0 & (v_sector == v_two)); + v_s = v_s | (v_tab2 & (v_sector == v_three)); + v_s = v_s | (v_tab1 & (v_sector > v_three)); + + v_v = v_tab0 & (v_sector < v_one); + v_v = v_v | (v_tab2 & (v_sector == v_one)); + v_v = v_v | (v_tab1 & (v_sector == v_two)); + v_v = v_v | (v_tab1 & (v_sector == v_three)); + v_v = v_v | (v_tab3 & (v_sector == v_four)); + v_v = v_v | (v_tab0 & (v_sector > v_four)); +} +#endif + + +inline void HSV2RGB_native(const float* src, float* dst, const float hscale, const int bidx) +{ + float h = src[0], s = src[1], v = src[2]; + float b, g, r; + + if( s == 0 ) + b = g = r = v; + else + { + static const int sector_data[][3]= + {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; + float tab[4]; + int sector; + h *= hscale; + if( h < 0 ) + do h += 6; while( h < 0 ); + else if( h >= 6 ) + do h -= 6; while( h >= 6 ); + sector = cvFloor(h); + h -= sector; + if( (unsigned)sector >= 6u ) + { + sector = 0; + h = 0.f; + } + + tab[0] = v; + tab[1] = v*(1.f - s); + tab[2] = v*(1.f - s*h); + tab[3] = v*(1.f - s*(1.f - h)); + + b = tab[sector_data[sector][0]]; + g = tab[sector_data[sector][1]]; + r = tab[sector_data[sector][2]]; + } + + dst[bidx] = b; + dst[1] = g; + dst[bidx^2] = r; +} + + +struct HSV2RGB_f +{ + typedef float channel_type; + + HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange) + : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { + #if CV_SIMD128 + hasSIMD = hasSIMD128(); + #endif + } + + void operator()(const float* src, float* dst, int n) const + { + int i = 0, bidx = blueIdx, dcn = dstcn; + n *= 3; + + if (dcn == 3) + { + #if CV_SIMD128 + if (hasSIMD) + { + for 
(; i <= n - 12; i += 12, dst += dcn * 4) + { + v_float32x4 v_src[3]; + v_load_deinterleave(src + i, v_src[0], v_src[1], v_src[2]); + HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); + v_store_interleave(dst, v_src[bidx], v_src[1], v_src[bidx^2]); + } + } + #endif + for( ; i < n; i += 3, dst += dcn ) + { + HSV2RGB_native(src + i, dst, hscale, bidx); + } + } else { // dcn == 4 + float alpha = ColorChannel::max(); + #if CV_SIMD128 + if (hasSIMD) + { + for (; i <= n - 12; i += 12, dst += dcn * 4) + { + v_float32x4 v_src[3]; + v_load_deinterleave(src + i, v_src[0], v_src[1], v_src[2]); + HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); + v_float32x4 v_a = v_setall_f32(alpha); + v_store_interleave(dst, v_src[bidx], v_src[1], v_src[bidx^2], v_a); + } + } + #endif + for( ; i < n; i += 3, dst += dcn ) + { + HSV2RGB_native(src + i, dst, hscale, bidx); + dst[3] = alpha; + } + } + } + + int dstcn, blueIdx; + float hscale; + #if CV_SIMD128 + bool hasSIMD; + #endif +}; + + +struct HSV2RGB_b +{ + typedef uchar channel_type; + + HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange) + : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.0f / _hrange) + { + #if CV_SIMD128 + hasSIMD = hasSIMD128(); + #endif + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int j = 0, dcn = dstcn; + uchar alpha = ColorChannel::max(); + + #if CV_SIMD128 + if (hasSIMD) + { + for (j = 0; j <= (n - 16) * 3; j += 48, dst += dcn * 16) + { + v_uint8x16 h_b, s_b, v_b; + v_uint16x8 h_w[2], s_w[2], v_w[2]; + v_uint32x4 h_u[4], s_u[4], v_u[4]; + v_load_deinterleave(src + j, h_b, s_b, v_b); + v_expand(h_b, h_w[0], h_w[1]); + v_expand(s_b, s_w[0], s_w[1]); + v_expand(v_b, v_w[0], v_w[1]); + v_expand(h_w[0], h_u[0], h_u[1]); + v_expand(h_w[1], h_u[2], h_u[3]); + v_expand(s_w[0], s_u[0], s_u[1]); + v_expand(s_w[1], s_u[2], s_u[3]); + v_expand(v_w[0], v_u[0], v_u[1]); + v_expand(v_w[1], v_u[2], v_u[3]); + + v_int32x4 b_i[4], g_i[4], r_i[4]; + v_float32x4 v_coeff0 = v_setall_f32(1.0f / 255.0f); + 
v_float32x4 v_coeff1 = v_setall_f32(255.0f); + + for( int k = 0; k < 4; k++ ) + { + v_float32x4 v_src[3]; + v_src[0] = v_cvt_f32(v_reinterpret_as_s32(h_u[k])); + v_src[1] = v_cvt_f32(v_reinterpret_as_s32(s_u[k])); + v_src[2] = v_cvt_f32(v_reinterpret_as_s32(v_u[k])); + + v_src[1] *= v_coeff0; + v_src[2] *= v_coeff0; + HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); + + v_src[0] *= v_coeff1; + v_src[1] *= v_coeff1; + v_src[2] *= v_coeff1; + b_i[k] = v_trunc(v_src[0]); + g_i[k] = v_trunc(v_src[1]); + r_i[k] = v_trunc(v_src[2]); + } + + v_uint16x8 r_w[2], g_w[2], b_w[2]; + v_uint8x16 r_b, g_b, b_b; + + r_w[0] = v_pack_u(r_i[0], r_i[1]); + r_w[1] = v_pack_u(r_i[2], r_i[3]); + r_b = v_pack(r_w[0], r_w[1]); + g_w[0] = v_pack_u(g_i[0], g_i[1]); + g_w[1] = v_pack_u(g_i[2], g_i[3]); + g_b = v_pack(g_w[0], g_w[1]); + b_w[0] = v_pack_u(b_i[0], b_i[1]); + b_w[1] = v_pack_u(b_i[2], b_i[3]); + b_b = v_pack(b_w[0], b_w[1]); + + if( dcn == 3 ) + { + if( blueIdx == 0 ) + v_store_interleave(dst, b_b, g_b, r_b); + else + v_store_interleave(dst, r_b, g_b, b_b); + } + else + { + v_uint8x16 alpha_b = v_setall_u8(alpha); + if( blueIdx == 0 ) + v_store_interleave(dst, b_b, g_b, r_b, alpha_b); + else + v_store_interleave(dst, r_b, g_b, b_b, alpha_b); + } + } + } + #endif + for( ; j < n * 3; j += 3, dst += dcn ) + { + float buf[6]; + buf[0] = src[j]; + buf[1] = src[j+1] * (1.0f / 255.0f); + buf[2] = src[j+2] * (1.0f / 255.0f); + HSV2RGB_native(buf, buf + 3, hscale, blueIdx); + dst[0] = saturate_cast(buf[3] * 255.0f); + dst[1] = saturate_cast(buf[4] * 255.0f); + dst[2] = saturate_cast(buf[5] * 255.0f); + if( dcn == 4 ) + dst[3] = alpha; + } + } + + int dstcn; + int blueIdx; + float hscale; + #if CV_SIMD128 + bool hasSIMD; + #endif +}; + + +///////////////////////////////////// RGB <-> HLS //////////////////////////////////////// + +struct RGB2HLS_f +{ + typedef float channel_type; + + RGB2HLS_f(int _srccn, int _blueIdx, float _hrange) + : srccn(_srccn), blueIdx(_blueIdx), 
hscale(_hrange/360.f) { + #if CV_SIMD128 + hasSIMD = hasSIMD128(); + #endif + } + + #if CV_SIMD128 + inline void process(v_float32x4& v_r, v_float32x4& v_g, + v_float32x4& v_b, v_float32x4& v_hscale) const + { + v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b); + v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b); + + v_float32x4 v_diff = v_max_rgb - v_min_rgb; + v_float32x4 v_sum = v_max_rgb + v_min_rgb; + v_float32x4 v_half = v_setall_f32(0.5f); + v_float32x4 v_l = v_sum * v_half; + + v_float32x4 v_s = v_diff / v_select(v_l < v_half, v_sum, v_setall_f32(2.0f) - v_sum); + + v_float32x4 v_r_eq_max = v_max_rgb == v_r; + v_float32x4 v_g_eq_max = v_max_rgb == v_g; + v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b, + v_select(v_g_eq_max, v_b - v_r, v_r - v_g)); + v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f), + v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f))); + v_float32x4 v_rev_diff = v_setall_f32(60.0f) / v_diff; + v_h = v_muladd(v_h, v_rev_diff, v_res) * v_hscale; + + v_float32x4 v_diff_gt_eps = v_diff > v_setall_f32(FLT_EPSILON); + v_r = v_diff_gt_eps & v_h; + v_g = v_l; + v_b = v_diff_gt_eps & v_s; + } + #endif + + void operator()(const float* src, float* dst, int n) const + { + int i = 0, bidx = blueIdx, scn = srccn; + n *= 3; + + #if CV_SIMD128 + if (hasSIMD) + { + v_float32x4 v_hscale = v_setall_f32(hscale); + if (scn == 3) { + if (bidx) { + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + v_float32x4 v_r; + v_float32x4 v_g; + v_float32x4 v_b; + v_load_deinterleave(src, v_r, v_g, v_b); + process(v_r, v_g, v_b, v_hscale); + v_store_interleave(dst + i, v_r, v_g, v_b); + } + } else { + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + v_float32x4 v_r; + v_float32x4 v_g; + v_float32x4 v_b; + v_load_deinterleave(src, v_r, v_g, v_b); + process(v_b, v_g, v_r, v_hscale); + v_store_interleave(dst + i, v_b, v_g, v_r); + } + } + } else { // scn == 4 + if (bidx) { + for ( ; i <= n - 12; i += 12, src += scn * 4) + { 
+ v_float32x4 v_r; + v_float32x4 v_g; + v_float32x4 v_b; + v_float32x4 v_a; + v_load_deinterleave(src, v_r, v_g, v_b, v_a); + process(v_r, v_g, v_b, v_hscale); + v_store_interleave(dst + i, v_r, v_g, v_b); + } + } else { + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + v_float32x4 v_r; + v_float32x4 v_g; + v_float32x4 v_b; + v_float32x4 v_a; + v_load_deinterleave(src, v_r, v_g, v_b, v_a); + process(v_b, v_g, v_r, v_hscale); + v_store_interleave(dst + i, v_b, v_g, v_r); + } + } + } + } + #endif + + for( ; i < n; i += 3, src += scn ) + { + float b = src[bidx], g = src[1], r = src[bidx^2]; + float h = 0.f, s = 0.f, l; + float vmin, vmax, diff; + + vmax = vmin = r; + if( vmax < g ) vmax = g; + if( vmax < b ) vmax = b; + if( vmin > g ) vmin = g; + if( vmin > b ) vmin = b; + + diff = vmax - vmin; + l = (vmax + vmin)*0.5f; + + if( diff > FLT_EPSILON ) + { + s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); + diff = 60.f/diff; + + if( vmax == r ) + h = (g - b)*diff; + else if( vmax == g ) + h = (b - r)*diff + 120.f; + else + h = (r - g)*diff + 240.f; + + if( h < 0.f ) h += 360.f; + } + + dst[i] = h*hscale; + dst[i+1] = l; + dst[i+2] = s; + } + } + + int srccn, blueIdx; + float hscale; + #if CV_SIMD128 + bool hasSIMD; + #endif +}; + + +struct RGB2HLS_b +{ + typedef uchar channel_type; + + RGB2HLS_b(int _srccn, int _blueIdx, int _hrange) + : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + #endif + } + + #if CV_SSE2 + void process(const float * buf, + __m128 & v_coeffs, uchar * dst) const + { + __m128 v_l0f = _mm_load_ps(buf); + __m128 v_l1f = _mm_load_ps(buf + 4); + __m128 v_u0f = _mm_load_ps(buf + 8); + __m128 v_u1f = _mm_load_ps(buf + 12); + + v_l0f = _mm_mul_ps(v_l0f, v_coeffs); + v_u1f = 
_mm_mul_ps(v_u1f, v_coeffs); + v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92)); + v_u0f = _mm_mul_ps(v_u0f, v_coeffs); + v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92)); + v_l1f = _mm_mul_ps(v_l1f, v_coeffs); + + __m128i v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f)); + __m128i v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f)); + __m128i v_l0 = _mm_packus_epi16(v_l, v_u); + + _mm_storeu_si128((__m128i *)(dst), v_l0); + } + #endif + + void operator()(const uchar* src, uchar* dst, int n) const + { + int i, j, scn = srccn; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; + #if CV_SSE2 + __m128 v_coeffs = _mm_set_ps(1.f, 255.f, 255.f, 1.f); + #endif + + for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) + { + int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; + + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn) + { + uint16x8_t v_t0, v_t1, v_t2; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + + float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #elif CV_SSE2 + if (scn == 3 && haveSIMD) + { + for ( ; j <= (dn * 3 - 16); j += 
16, src += 16) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)src); + + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + } + + int jr = j % 3; + if (jr) + src -= jr, j -= jr; + } + else if (scn == 4 && haveSIMD) + { + for ( ; j <= (dn * 3 - 12); j += 12, src += 16) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)src); + + __m128i v_src_lo = _mm_unpacklo_epi8(v_src, v_zero); + __m128i v_src_hi = _mm_unpackhi_epi8(v_src, v_zero); + _mm_storeu_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_lo, v_zero)), v_scale_inv)); + _mm_storeu_ps(buf + j + 3, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_lo, v_zero)), v_scale_inv)); + _mm_storeu_ps(buf + j + 6, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_hi, v_zero)), v_scale_inv)); + float tmp = buf[j + 8]; + _mm_storeu_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_unpackhi_epi16(v_src_hi, v_zero), 0x90)), v_scale_inv)); + buf[j + 8] = tmp; + } + + int jr = j % 3; + if (jr) + src -= jr, j -= jr; + } + #endif + for( ; j < dn*3; j += 3, src += scn ) + { + buf[j] = src[0]*(1.f/255.f); + buf[j+1] = src[1]*(1.f/255.f); + buf[j+2] = src[2]*(1.f/255.f); + } + cvt(buf, buf, dn); + + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + + uint8x8x3_t v_dst; + v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])), + vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0])))); + v_dst.val[1] = 
vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + vst3_u8(dst + j, v_dst); + } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; j <= (dn - 16) * 3; j += 48) + { + process(buf + j, + v_coeffs, dst + j); + + process(buf + j + 16, + v_coeffs, dst + j + 16); + + process(buf + j + 32, + v_coeffs, dst + j + 32); + } + } + #endif + for( ; j < dn*3; j += 3 ) + { + dst[j] = saturate_cast(buf[j]); + dst[j+1] = saturate_cast(buf[j+1]*255.f); + dst[j+2] = saturate_cast(buf[j+2]*255.f); + } + } + } + + int srccn; + RGB2HLS_f cvt; + #if CV_NEON + float32x4_t v_scale, v_scale_inv; + uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale_inv; + __m128i v_zero; + bool haveSIMD; + #endif +}; + + +struct HLS2RGB_f +{ + typedef float channel_type; + + HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange) + : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { + #if CV_SIMD128 + hasSIMD = hasSIMD128(); + #endif + } + + #if CV_SIMD128 + inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const + { + v_float32x4 v_one = v_setall_f32(1.0f); + + v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f); + v_float32x4 v_ls = v_l * v_s; + v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls); + + v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale); + v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw)); + v_float32x4 v_hs = v_hs_raw - v_pre_hs; + v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f))); + v_float32x4 v_elem1 = v_hs + v_hs; + + v_float32x4 v_tab0 = v_l + v_elem0; + v_float32x4 v_tab1 = v_l - v_elem0; + v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1; + v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1; + + 
v_float32x4 v_two = v_setall_f32(2.0f); + v_float32x4 v_four = v_setall_f32(4.0f); + + v_h = v_select(v_sector < v_two , v_tab1, + v_select(v_sector <= v_two , v_tab3, + v_select(v_sector <= v_four, v_tab0, v_tab2))); + + v_l = v_select(v_sector < v_one , v_tab3, + v_select(v_sector <= v_two , v_tab0, + v_select(v_sector < v_four, v_tab2, v_tab1))); + + v_s = v_select(v_sector < v_one , v_tab0, + v_select(v_sector < v_two , v_tab2, + v_select(v_sector < v_four, v_tab1, + v_select(v_sector <= v_four, v_tab3, v_tab0)))); + } + #endif + + void operator()(const float* src, float* dst, int n) const + { + int i = 0, bidx = blueIdx, dcn = dstcn; + float alpha = ColorChannel::max(); + n *= 3; + + #if CV_SIMD128 + if (hasSIMD) + { + if (dcn == 3) + { + if (bidx) + { + for (; i <= n - 12; i += 12, dst += dcn * 4) + { + v_float32x4 v_h; + v_float32x4 v_l; + v_float32x4 v_s; + v_load_deinterleave(src + i, v_h, v_l, v_s); + process(v_h, v_l, v_s); + v_store_interleave(dst, v_s, v_l, v_h); + } + } else { + for (; i <= n - 12; i += 12, dst += dcn * 4) + { + v_float32x4 v_h; + v_float32x4 v_l; + v_float32x4 v_s; + v_load_deinterleave(src + i, v_h, v_l, v_s); + process(v_h, v_l, v_s); + v_store_interleave(dst, v_h, v_l, v_s); + } + } + } else { // dcn == 4 + if (bidx) + { + for (; i <= n - 12; i += 12, dst += dcn * 4) + { + v_float32x4 v_h; + v_float32x4 v_l; + v_float32x4 v_s; + v_load_deinterleave(src + i, v_h, v_l, v_s); + process(v_h, v_l, v_s); + v_float32x4 v_a = v_setall_f32(alpha); + v_store_interleave(dst, v_s, v_l, v_h, v_a); + } + } else { + for (; i <= n - 12; i += 12, dst += dcn * 4) + { + v_float32x4 v_h; + v_float32x4 v_l; + v_float32x4 v_s; + v_load_deinterleave(src + i, v_h, v_l, v_s); + process(v_h, v_l, v_s); + v_float32x4 v_a = v_setall_f32(alpha); + v_store_interleave(dst, v_h, v_l, v_s, v_a); + } + } + } + } + #endif + + for( ; i < n; i += 3, dst += dcn ) + { + float h = src[i], l = src[i+1], s = src[i+2]; + float b, g, r; + + if( s == 0 ) + b = g = r = l; + 
else + { + static const int sector_data[][3]= + {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; + float tab[4]; + int sector; + + float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; + float p1 = 2*l - p2; + + h *= hscale; + if( h < 0 ) + do h += 6; while( h < 0 ); + else if( h >= 6 ) + do h -= 6; while( h >= 6 ); + + assert( 0 <= h && h < 6 ); + sector = cvFloor(h); + h -= sector; + + tab[0] = p2; + tab[1] = p1; + tab[2] = p1 + (p2 - p1)*(1-h); + tab[3] = p1 + (p2 - p1)*h; + + b = tab[sector_data[sector][0]]; + g = tab[sector_data[sector][1]]; + r = tab[sector_data[sector][2]]; + } + + dst[bidx] = b; + dst[1] = g; + dst[bidx^2] = r; + if( dcn == 4 ) + dst[3] = alpha; + } + } + + int dstcn, blueIdx; + float hscale; + #if CV_SIMD128 + bool hasSIMD; + #endif +}; + + +struct HLS2RGB_b +{ + typedef uchar channel_type; + + HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange) + : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale = _mm_set1_ps(255.f); + v_alpha = _mm_set1_ps(ColorChannel::max()); + v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + #endif + } + + #if CV_SSE2 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + const __m128& v_coeffs_, + float * buf) const + { + __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); + __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); + __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); + + __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); + __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); + __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); + + __m128 v_coeffs = v_coeffs_; + + v_r0 = _mm_mul_ps(v_r0, v_coeffs); + v_g1 = _mm_mul_ps(v_g1, v_coeffs); + + v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49)); + + v_r1 = _mm_mul_ps(v_r1, 
v_coeffs); + v_b0 = _mm_mul_ps(v_b0, v_coeffs); + + v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49)); + + v_g0 = _mm_mul_ps(v_g0, v_coeffs); + v_b1 = _mm_mul_ps(v_b1, v_coeffs); + + _mm_store_ps(buf, v_r0); + _mm_store_ps(buf + 4, v_r1); + _mm_store_ps(buf + 8, v_g0); + _mm_store_ps(buf + 12, v_g1); + _mm_store_ps(buf + 16, v_b0); + _mm_store_ps(buf + 20, v_b1); + } + #endif + + void operator()(const uchar* src, uchar* dst, int n) const + { + int i, j, dcn = dstcn; + uchar alpha = ColorChannel::max(); + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; + #if CV_SSE2 + __m128 v_coeffs = _mm_set_ps(1.f, 1.f/255.f, 1.f/255.f, 1.f); + #endif + + for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) + { + int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; + + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + + float32x4x3_t v_dst; + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; j <= (dn - 8) * 3; j += 24) + { + __m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16)); + + process(_mm_unpacklo_epi8(v_src0, v_zero), + _mm_unpackhi_epi8(v_src0, v_zero), + _mm_unpacklo_epi8(v_src1, v_zero), + v_coeffs, + buf + j); + } + } + #endif + for( ; j < dn*3; j += 3 ) + { + buf[j] = src[j]; + 
buf[j+1] = src[j+1]*(1.f/255.f); + buf[j+2] = src[j+2]*(1.f/255.f); + } + cvt(buf, buf, dn); + + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + + if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #elif CV_SSE2 + if (dcn == 3 && haveSIMD) + { + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } + else if (dcn == 4 && haveSIMD) + { + for ( ; j <= (dn * 3 - 12); j += 12, dst += 16) + { + __m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 
v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + + __m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha); + __m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha); + + __m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44)); + __m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78); + __m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e)); + __m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78); + + __m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1); + __m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } + #endif + + for( ; j < dn*3; j += 3, dst += dcn ) + { + dst[0] = saturate_cast(buf[j]*255.f); + dst[1] = saturate_cast(buf[j+1]*255.f); + dst[2] = saturate_cast(buf[j+2]*255.f); + if( dcn == 4 ) + dst[3] = alpha; + } + } + } + + int dstcn; + HLS2RGB_f cvt; + #if CV_NEON + float32x4_t v_scale, v_scale_inv; + uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale; + __m128 v_alpha; + __m128i v_zero; + bool haveSIMD; + #endif +}; + +// +// IPP functions +// + +#if NEED_IPP + +#if !IPP_DISABLE_RGB_HSV +static ippiGeneralFunc ippiRGB2HSVTab[] = +{ + (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0, + 0, 0, 0, 0 +}; +#endif + +static ippiGeneralFunc ippiHSV2RGBTab[] = +{ + (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0, + 0, 0, 0, 0 +}; + +static ippiGeneralFunc ippiRGB2HLSTab[] = +{ + (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0, + 0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0 +}; + +static ippiGeneralFunc ippiHLS2RGBTab[] = +{ + (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0, + 0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0 +}; + +#endif + +// +// HAL functions +// + +namespace hal +{ + +// 8u, 32f +void 
cvtBGRtoHSV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtBGRtoHSV, cv_hal_cvtBGRtoHSV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV); + +#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 + CV_IPP_CHECK() + { + if(depth == CV_8U && isFullRange) + { + if (isHSV) + { +#if !IPP_DISABLE_RGB_HSV // breaks OCL accuracy tests + if(scn == 3 && !swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) ) + return; + } + else if(scn == 4 && !swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) ) + return; + } + else if(scn == 4 && swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) ) + return; + } +#endif + } + else + { + if(scn == 3 && !swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) ) + return; + } + else if(scn == 4 && !swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) ) + return; + } + else if(scn == 3 && swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, + IPPGeneralFunctor(ippiRGB2HLSTab[depth])) ) + return; + } + else if(scn == 4 && swapBlue) + { + 
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) ) + return; + } + } + } + } +#endif + + int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180; + int blueIdx = swapBlue ? 2 : 0; + if(isHSV) + { + if(depth == CV_8U) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_b(scn, blueIdx, hrange)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_f(scn, blueIdx, static_cast(hrange))); + } + else + { + if( depth == CV_8U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_b(scn, blueIdx, hrange)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_f(scn, blueIdx, static_cast(hrange))); + } +} + +// 8u, 32f +void cvtHSVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtHSVtoBGR, cv_hal_cvtHSVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV); + +#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 + CV_IPP_CHECK() + { + if (depth == CV_8U && isFullRange) + { + if (isHSV) + { + if(dcn == 3 && !swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, + IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) ) + return; + } + else if(dcn == 4 && !swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) ) + return; + } + else if(dcn == 3 && swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, + 
IPPGeneralFunctor(ippiHSV2RGBTab[depth])) ) + return; + } + else if(dcn == 4 && swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) ) + return; + } + } + else + { + if(dcn == 3 && !swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, + IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) ) + return; + } + else if(dcn == 4 && !swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) ) + return; + } + else if(dcn == 3 && swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, + IPPGeneralFunctor(ippiHLS2RGBTab[depth])) ) + return; + } + else if(dcn == 4 && swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) ) + return; + } + } + } + } +#endif + + int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180; + int blueIdx = swapBlue ? 
2 : 0; + if(isHSV) + { + if( depth == CV_8U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_b(dcn, blueIdx, hrange)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_f(dcn, blueIdx, static_cast(hrange))); + } + else + { + if( depth == CV_8U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_b(dcn, blueIdx, hrange)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_f(dcn, blueIdx, static_cast(hrange))); + } +} + +} // namespace hal + +// +// OCL calls +// + +#ifdef HAVE_OPENCL + +bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ) +{ + OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); + + int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255); + + if(!h.createKernel("HSV2RGB", ocl::imgproc::color_hsv_oclsrc, + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange))) + { + return false; + } + + return h.run(); +} + +bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ) +{ + OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); + + int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255); + + if(!h.createKernel("HLS2RGB", ocl::imgproc::color_hsv_oclsrc, + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange))) + { + return false; + } + + return h.run(); +} + +bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full ) +{ + OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); + + float hscale = (_src.depth() == CV_32F ? 360.f : (!full ? 
180.f : 256.f))/360.f; + + if(!h.createKernel("RGB2HLS", ocl::imgproc::color_hsv_oclsrc, + format("-D hscale=%ff -D bidx=%d -D dcn=3", hscale, bidx))) + { + return false; + } + + return h.run(); +} + +bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full ) +{ + OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); + + int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 256); + + cv::String options = (_src.depth() == CV_8U ? + format("-D hrange=%d -D bidx=%d -D dcn=3", hrange, bidx) : + format("-D hscale=%ff -D bidx=%d -D dcn=3", hrange*(1.f/360.f), bidx)); + + if(!h.createKernel("RGB2HSV", ocl::imgproc::color_hsv_oclsrc, options)) + { + return false; + } + + if(_src.depth() == CV_8U) + { + static UMat sdiv_data; + static UMat hdiv_data180; + static UMat hdiv_data256; + static int sdiv_table[256]; + static int hdiv_table180[256]; + static int hdiv_table256[256]; + static volatile bool initialized180 = false, initialized256 = false; + volatile bool & initialized = hrange == 180 ? initialized180 : initialized256; + + if (!initialized) + { + int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12; + UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256; + + sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0; + + int v = 255 << hsv_shift; + if (!initialized180 && !initialized256) + { + for(int i = 1; i < 256; i++ ) + sdiv_table[i] = saturate_cast(v/(1.*i)); + Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data); + } + + v = hrange << hsv_shift; + for (int i = 1; i < 256; i++ ) + hdiv_table[i] = saturate_cast(v/(6.*i)); + + Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data); + initialized = true; + } + + h.setArg(ocl::KernelArg::PtrReadOnly(sdiv_data)); + h.setArg(hrange == 256 ? 
ocl::KernelArg::PtrReadOnly(hdiv_data256) : + ocl::KernelArg::PtrReadOnly(hdiv_data180)); + } + + return h.run(); +} + +#endif + +// +// HAL calls +// + +void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ) +{ + CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); + + hal::cvtBGRtoHSV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.depth, h.scn, swapb, fullRange, false); +} + +void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ) +{ + CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); + + hal::cvtBGRtoHSV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.depth, h.scn, swapb, fullRange, true); +} + +void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange) +{ + if(dcn <= 0) dcn = 3; + CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); + + hal::cvtHSVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.depth, dcn, swapb, fullRange, false); +} + +void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange) +{ + if(dcn <= 0) dcn = 3; + CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); + + hal::cvtHSVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.depth, dcn, swapb, fullRange, true); +} + + +} // namespace cv diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp new file mode 100644 index 0000000000..9245f26d05 --- /dev/null +++ b/modules/imgproc/src/color_rgb.simd.hpp @@ -0,0 +1,1656 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "precomp.hpp" +#include "color.hpp" + +#define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1 + +namespace cv +{ + +////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// + +template struct v_type; + +template<> +struct v_type{ + typedef v_uint8 t; +}; + +template<> +struct v_type{ + typedef v_uint16 t; +}; + +template<> +struct v_type{ + typedef v_float32 t; +}; + +template struct v_set; + +template<> +struct v_set +{ + static inline v_type::t set(uchar x) + { + return vx_setall_u8(x); + } +}; + +template<> +struct v_set +{ + static inline v_type::t set(ushort x) + { + return vx_setall_u16(x); + } +}; + +template<> +struct v_set +{ + static inline v_type::t set(float x) + { + return vx_setall_f32(x); + } +}; + +template +struct RGB2RGB +{ + typedef _Tp channel_type; + typedef typename v_type<_Tp>::t vt; + + RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : + srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) + { + CV_Assert(srccn == 3 || srccn == 4); + CV_Assert(dstcn == 3 || dstcn == 4); + } + + void operator()(const _Tp* src, _Tp* dst, int n) const + { + int scn = srccn, dcn = dstcn, bi = blueIdx; + int i = 0; + _Tp alphav = ColorChannel<_Tp>::max(); + +#if CV_SIMD + const int vsize = vt::nlanes; + + for(; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize*dcn) + { + vt a, b, c, d; + if(scn == 4) + { + v_load_deinterleave(src, a, b, c, d); + } + else + { + v_load_deinterleave(src, a, b, c); + d = v_set<_Tp>::set(alphav); + } + if(bi == 2) + swap(a, c); + + if(dcn == 4) + { + v_store_interleave(dst, a, b, c, d); + } + else + { + v_store_interleave(dst, a, b, c); + } + } + vx_cleanup(); +#endif + for ( ; i < n; i++, src += scn, dst += dcn ) + { + _Tp t0 = src[0], t1 = src[1], t2 = src[2]; + dst[bi ] = t0; + dst[1] = t1; + dst[bi^2] = t2; + if(dcn == 4) + { + _Tp d = scn 
== 4 ? src[3] : alphav; + dst[3] = d; + } + } + } + + int srccn, dstcn, blueIdx; +}; + + +/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// + +struct RGB5x52RGB +{ + typedef uchar channel_type; + + RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits) + : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) + { } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, gb = greenBits; + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 vz = vx_setzero_u8(), vn0 = vx_setall_u8(255); + for(; i <= n-vsize; + i += vsize, src += vsize*sizeof(ushort), dst += vsize*dcn) + { + v_uint16 t0 = v_reinterpret_as_u16(vx_load(src)); + v_uint16 t1 = v_reinterpret_as_u16(vx_load(src + + sizeof(ushort)*v_uint16::nlanes)); + + //TODO: shorten registers use when v_interleave is available + v_uint8 r, g, b, a; + v_uint16 b0 = (t0 << 11) >> 8; + v_uint16 b1 = (t1 << 11) >> 8; + b = v_pack(b0, b1); + + v_uint16 g0, g1, r0, r1, a0, a1; + + if( gb == 6 ) + { + g0 = ((t0 >> 5) << 10) >> 8; + g1 = ((t1 >> 5) << 10) >> 8; + + r0 = (t0 >> 11) << 3; + r1 = (t1 >> 11) << 3; + + a = vn0; + } + else + { + g0 = ((t0 >> 5) << 11) >> 8; + g1 = ((t1 >> 5) << 11) >> 8; + + r0 = ((t0 >> 10) << 11) >> 8; + r1 = ((t1 >> 10) << 11) >> 8; + + a0 = t0 >> 15; + a1 = t1 >> 15; + a = v_pack(a0, a1); + a = a != vz; + } + g = v_pack(g0, g1); + r = v_pack(r0, r1); + + if(bidx == 2) + swap(b, r); + + if(dcn == 4) + { + v_store_interleave(dst, b, g, r, a); + } + else + { + v_store_interleave(dst, b, g, r); + } + } + vx_cleanup(); +#endif + + for( ; i < n; i++, src += sizeof(ushort), dst += dcn ) + { + unsigned t = ((const ushort*)src)[0]; + uchar b, g, r, a; + + b = (uchar)(t << 3); + + if( gb == 6 ) + { + g = (uchar)((t >> 3) & ~3); + r = (uchar)((t >> 8) & ~7); + a = 255; + } + else + { + g = (uchar)((t >> 2) & ~7); + r = (uchar)((t >> 7) & ~7); + a = (uchar)(((t & 0x8000) >> 15) * 255); + } + + dst[bidx] 
= b; + dst[1] = g; + dst[bidx ^ 2] = r; + if( dcn == 4 ) + dst[3] = a; + } + } + + int dstcn, blueIdx, greenBits; +}; + + +struct RGB2RGB5x5 +{ + typedef uchar channel_type; + + RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits) + : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) + { } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int scn = srccn, bidx = blueIdx, gb = greenBits; + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint16 vn3 = vx_setall_u16((ushort)(~3)); + v_uint16 vn7 = vx_setall_u16((ushort)(~7)); + v_uint16 vz = vx_setzero_u16(); + v_uint8 v7 = vx_setall_u8((uchar)(~7)); + for(; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize*sizeof(ushort)) + { + v_uint8 r, g, b, a; + if(scn == 3) + { + v_load_deinterleave(src, b, g, r); + a = vx_setzero_u8(); + } + else + { + v_load_deinterleave(src, b, g, r, a); + } + if(bidx == 2) + swap(b, r); + + r = r & v7; + + //TODO: shorten registers use when v_deinterleave is available + v_uint16 r0, r1, g0, g1, b0, b1, a0, a1; + v_expand(r, r0, r1); + v_expand(g, g0, g1); + v_expand(b, b0, b1); + v_expand(a, a0, a1); + + v_uint16 d0, d1; + + b0 = b0 >> 3; + b1 = b1 >> 3; + a0 = (a0 != vz) << 15; + a1 = (a1 != vz) << 15; + + if(gb == 6) + { + d0 = b0 | ((g0 & vn3) << 3) | (r0 << 8); + d1 = b1 | ((g1 & vn3) << 3) | (r1 << 8); + } + else + { + d0 = b0 | ((g0 & vn7) << 2) | (r0 << 7) | a0; + d1 = b1 | ((g1 & vn7) << 2) | (r1 << 7) | a1; + } + + v_store((ushort*)dst, d0); + v_store(((ushort*)dst) + vsize/2, d1); + } + vx_cleanup(); +#endif + for ( ; i < n; i++, src += scn, dst += sizeof(ushort) ) + { + uchar r = src[bidx^2]; + uchar g = src[1]; + uchar b = src[bidx]; + uchar a = scn == 4 ? src[3] : 0; + + ushort d; + if (gb == 6) + { + d = (ushort)((b >> 3)|((g & ~3) << 3)|((r & ~7) << 8)); + } + else + { + d = (ushort)((b >> 3)|((g & ~7) << 2)|((r & ~7) << 7)|(a ? 
0x8000 : 0)); + } + ((ushort*)dst)[0] = d; + } + } + + int srccn, blueIdx, greenBits; +}; + + +///////////////////////////////// Color to/from Grayscale //////////////////////////////// + +template +struct Gray2RGB +{ + typedef _Tp channel_type; + typedef typename v_type<_Tp>::t vt; + + Gray2RGB(int _dstcn) : dstcn(_dstcn) {} + void operator()(const _Tp* src, _Tp* dst, int n) const + { + int dcn = dstcn; + int i = 0; + _Tp alpha = ColorChannel<_Tp>::max(); + +#if CV_SIMD + const int vsize = vt::nlanes; + vt valpha = v_set<_Tp>::set(alpha); + for(; i <= n-vsize; + i += vsize, src += vsize, dst += vsize*dcn) + { + vt g = vx_load(src); + + if(dcn == 3) + { + v_store_interleave(dst, g, g, g); + } + else + { + v_store_interleave(dst, g, g, g, valpha); + } + } + vx_cleanup(); +#endif + for ( ; i < n; i++, src++, dst += dcn ) + { + dst[0] = dst[1] = dst[2] = src[0]; + if(dcn == 4) + dst[3] = alpha; + } + } + + int dstcn; +}; + + +struct Gray2RGB5x5 +{ + typedef uchar channel_type; + + Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) + { } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int gb = greenBits; + int i = 0; +#if CV_SIMD + const int vsize = v_uint16::nlanes; + v_uint16 v3 = vx_setall_u16((ushort)(~3)); + for(; i <= n-vsize; + i += vsize, src += vsize, dst += vsize*sizeof(ushort)) + { + v_uint8 t8 = vx_load_low(src); + v_uint16 t = v_expand_low(t8); + + v_uint16 t3 = t >> 3; + + v_uint16 d = t3; + if(gb == 6) + { + d |= ((t & v3) << 3) | (t3 << 11); + } + else + { + d |= (t3 << 5) | (t3 << 10); + } + + v_store((ushort*)dst, d); + } + vx_cleanup(); +#endif + + for( ; i < n; i++, src++, dst += sizeof(ushort)) + { + int t = src[0]; + int t3 = t >> 3; + ushort d; + if( gb == 6 ) + { + d = (ushort)(t3 |((t & ~3) << 3)|(t3 << 11)); + } + else + { + d = (ushort)(t3 |(t3 << 5)|(t3 << 10)); + } + ((ushort*)dst)[0] = d; + } + } + int greenBits; +}; + + +struct RGB5x52Gray +{ + typedef uchar channel_type; + + // can be changed to 15-shift coeffs + 
static const int BY = B2Y; + static const int GY = G2Y; + static const int RY = R2Y; + static const int shift = yuv_shift; + + RGB5x52Gray(int _greenBits) : greenBits(_greenBits) + { + CV_Assert(BY + GY + RY == (1 << shift)); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int gb = greenBits; + int i = 0; +#if CV_SIMD + const int vsize = v_uint16::nlanes; + + v_int16 bg2y; + v_int16 r12y; + v_int16 dummy; + v_zip(vx_setall_s16(BY), vx_setall_s16(GY), bg2y, dummy); + v_zip(vx_setall_s16(RY), vx_setall_s16( 1), r12y, dummy); + v_int16 delta = vx_setall_s16(1 << (shift-1)); + + for(; i <= n-vsize; + i += vsize, src += vsize*sizeof(ushort), dst += vsize) + { + v_uint16 t = vx_load((ushort*)src); + + v_uint16 r, g, b; + b = (t << 11) >> 8; + + if(gb == 5) + { + g = ((t >> 5) << 11) >> 8; + r = ((t >> 10) << 11) >> 8; + } + else + { + g = ((t >> 5) << 10) >> 8; + r = (t >> 11) << 3; + } + + v_uint8 d; + v_uint16 dx; + + v_int16 sr = v_reinterpret_as_s16(r); + v_int16 sg = v_reinterpret_as_s16(g); + v_int16 sb = v_reinterpret_as_s16(b); + + v_int16 bg0, bg1; + v_int16 rd0, rd1; + v_zip(sb, sg, bg0, bg1); + v_zip(sr, delta, rd0, rd1); + + v_uint32 d0, d1; + d0 = v_reinterpret_as_u32(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)); + d1 = v_reinterpret_as_u32(v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)); + + d0 = d0 >> shift; + d1 = d1 >> shift; + + dx = v_pack(d0, d1); + // high part isn't used + d = v_pack(dx, dx); + + v_store_low(dst, d); + } + vx_cleanup(); +#endif + for( ; i < n; i++, src += sizeof(ushort), dst++) + { + int t = ((ushort*)src)[0]; + uchar r, g, b; + b = (t << 3) & 0xf8; + if( gb == 6 ) + { + g = (t >> 3) & 0xfc; + r = (t >> 8) & 0xf8; + } + else + { + g = (t >> 2) & 0xf8; + r = (t >> 7) & 0xf8; + } + dst[0] = (uchar)CV_DESCALE(b*BY + g*GY + r*RY, shift); + } + } + int greenBits; +}; + + +template struct RGB2Gray +{ + typedef _Tp channel_type; + + RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static 
const float coeffs0[] = { R2YF, G2YF, B2YF }; + memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) ); + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + } + + void operator()(const _Tp* src, _Tp* dst, int n) const + { + int scn = srccn; + float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + for(int i = 0; i < n; i++, src += scn) + dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr); + } + int srccn; + float coeffs[3]; +}; + + +template <> +struct RGB2Gray +{ + typedef float channel_type; + + RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const float coeffs0[] = { R2YF, G2YF, B2YF }; + for(int i = 0; i < 3; i++) + { + coeffs[i] = _coeffs ? _coeffs[i] : coeffs0[i]; + } + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, i = 0; + float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + +#if CV_SIMD + const int vsize = v_float32::nlanes; + v_float32 rv = vx_setall_f32(cr), gv = vx_setall_f32(cg), bv = vx_setall_f32(cb); + for(; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize) + { + v_float32 r, g, b, a; + if(scn == 3) + { + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, a); + } + + v_float32 d = v_fma(r, rv, v_fma(g, gv, b*bv)); + + v_store(dst, d); + } + vx_cleanup(); +#endif + + for ( ; i < n; i++, src += scn, dst++) + dst[0] = src[0]*cb + src[1]*cg + src[2]*cr; + } + + int srccn; + float coeffs[3]; +}; + +template<> +struct RGB2Gray +{ + typedef uchar channel_type; + + // can be changed to 15-shift coeffs + static const int BY = B2Y; + static const int GY = G2Y; + static const int RY = R2Y; + static const int shift = yuv_shift; + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn) + { + const int coeffs0[] = { RY, GY, BY }; + for(int i = 0; i < 3; i++) + coeffs[i] = (short)(_coeffs ? 
_coeffs[i] : coeffs0[i]); + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + + CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift)); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int scn = srccn; + short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_int16 bg2y; + v_int16 r12y; + v_int16 dummy; + v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy); + v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy); + v_int16 delta = vx_setall_s16(1 << (shift-1)); + + for( ; i <= n-vsize; + i += vsize, src += scn*vsize, dst += vsize) + { + v_uint8 r, g, b, a; + if(scn == 3) + { + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, a); + } + + //TODO: shorten registers use when v_deinterleave is available + + v_uint16 r0, r1, g0, g1, b0, b1; + v_expand(r, r0, r1); + v_expand(g, g0, g1); + v_expand(b, b0, b1); + + v_int16 bg00, bg01, bg10, bg11; + v_int16 rd00, rd01, rd10, rd11; + v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01); + v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11); + v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01); + v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11); + + v_uint32 y00, y01, y10, y11; + y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; + y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; + y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; + y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; + + v_uint16 y0, y1; + y0 = v_pack(y00, y01); + y1 = v_pack(y10, y11); + + v_uint8 y = v_pack(y0, y1); + v_store(dst, y); + } + vx_cleanup(); +#endif + + for( ; i < n; i++, src += scn, dst++) + { + int b = src[0], g = src[1], r = src[2]; + uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift); + dst[0] = y; + } + } + + int srccn; + 
short coeffs[3]; +}; + + +template<> +struct RGB2Gray +{ + typedef ushort channel_type; + + // can be changed to 15-shift coeffs + static const int BY = B2Y; + static const int GY = G2Y; + static const int RY = R2Y; + static const int shift = yuv_shift; + static const int fix_shift = (int)(sizeof(short)*8 - shift); + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn) + { + const int coeffs0[] = { RY, GY, BY }; + for(int i = 0; i < 3; i++) + coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]); + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + + CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift)); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int scn = srccn; + short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + int i = 0; + +#if CV_SIMD + const int vsize = v_uint16::nlanes; + + v_int16 b2y = vx_setall_s16(cb); + v_int16 g2y = vx_setall_s16(cg); + v_int16 r2y = vx_setall_s16(cr); + v_int16 one = vx_setall_s16(1); + v_int16 z = vx_setzero_s16(); + + v_int16 bg2y, r12y; + v_int16 dummy; + v_zip(b2y, g2y, bg2y, dummy); + v_zip(r2y, one, r12y, dummy); + + v_int16 delta = vx_setall_s16(1 << (shift-1)); + + for( ; i <= n-vsize; + i += vsize, src += scn*vsize, dst += vsize) + { + v_uint16 r, g, b, a; + if(scn == 3) + { + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, a); + } + + v_int16 sb = v_reinterpret_as_s16(b); + v_int16 sr = v_reinterpret_as_s16(r); + v_int16 sg = v_reinterpret_as_s16(g); + + v_int16 bg0, bg1; + v_int16 rd0, rd1; + v_zip(sb, sg, bg0, bg1); + v_zip(sr, delta, rd0, rd1); + + // fixing 16bit signed multiplication + v_int16 mr, mg, mb; + mr = (sr < z) & r2y; + mg = (sg < z) & g2y; + mb = (sb < z) & b2y; + v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; + + v_int32 sy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; + v_int32 sy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; + + v_int16 y = 
v_add_wrap(v_pack(sy0, sy1), fixmul); + + v_store((short*)dst, y); + } + vx_cleanup(); +#endif + for( ; i < n; i++, src += scn, dst++) + { + int b = src[0], g = src[1], r = src[2]; + ushort d = (ushort)CV_DESCALE((unsigned)(b*cb + g*cg + r*cr), shift); + dst[0] = d; + } + } + + int srccn; + short coeffs[3]; +}; + + +/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) ////////////// + +template +struct RGBA2mRGBA +{ + typedef _Tp channel_type; + + void operator()(const _Tp* src, _Tp* dst, int n) const + { + _Tp max_val = ColorChannel<_Tp>::max(); + _Tp half_val = ColorChannel<_Tp>::half(); + for( int i = 0; i < n; i++ ) + { + _Tp v0 = *src++; + _Tp v1 = *src++; + _Tp v2 = *src++; + _Tp v3 = *src++; + + *dst++ = (v0 * v3 + half_val) / max_val; + *dst++ = (v1 * v3 + half_val) / max_val; + *dst++ = (v2 * v3 + half_val) / max_val; + *dst++ = v3; + } + } +}; + + +template<> +struct RGBA2mRGBA +{ + typedef uchar channel_type; + + void operator()(const uchar* src, uchar* dst, int n) const + { + const uchar max_val = 255; + const uchar half_val = 128; + + int i = 0; +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); + v_uint16 vh = vx_setall_u16(half_val+1); + + // processing 4 registers per loop cycle is about 10% faster + // than processing 1 register + for( ; i <= n-vsize; + i += vsize, src += 4*vsize, dst += 4*vsize) + { + v_uint8 v[4]; + for(int j = 0; j < 4; j++) + v[j] = vx_load(src + j*vsize); + + // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 => + // => 00,00,a0,a0,00,00,a1,a1 + // => a0,a0,a0,a0,a1,a1,a1,a1 + + v_uint16 a16[4]; + for(int j = 0; j < 4; j++) + a16[j] = v_reinterpret_as_u16(v[j] & amask); + + v_uint32 a32[4]; + for(int j = 0; j < 4; j++) + a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8)); + + v_uint8 a[4]; + for(int j = 0; j < 4; j++) + a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16)); + + v_uint16 m[8]; + for(int j = 0; j < 4; j++) + v_mul_expand(v[j], a[j], 
m[j], m[j+4]); + + for(int j = 0; j < 8; j++) + m[j] += vh; + + // div 255: (v+1+(v>>8))>8 + // +1 is in vh, has no effect on (v>>8) + for(int j = 0; j < 8; j++) + m[j] = (m[j] + (m[j] >> 8)) >> 8; + + v_uint8 d[4]; + for(int j = 0; j < 4; j++) + d[j] = v_pack(m[j], m[j+4]); + + for(int j = 0; j < 4; j++) + d[j] = v_select(amask, a[j], d[j]); + + for(int j = 0; j < 4; j++) + v_store(dst + j*vsize, d[j]); + } + + vx_cleanup(); +#endif + for(; i < n; i++, src += 4, dst += 4 ) + { + uchar v0 = src[0]; + uchar v1 = src[1]; + uchar v2 = src[2]; + uchar v3 = src[3]; + + dst[0] = (v0 * v3 + half_val) / max_val; + dst[1] = (v1 * v3 + half_val) / max_val; + dst[2] = (v2 * v3 + half_val) / max_val; + dst[3] = v3; + } + } +}; + + +template +struct mRGBA2RGBA +{ + typedef _Tp channel_type; + + void operator()(const _Tp* src, _Tp* dst, int n) const + { + _Tp max_val = ColorChannel<_Tp>::max(); + for( int i = 0; i < n; i++ ) + { + _Tp v0 = *src++; + _Tp v1 = *src++; + _Tp v2 = *src++; + _Tp v3 = *src++; + _Tp v3_half = v3 / 2; + + *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v0 * max_val + v3_half) / v3); + *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v1 * max_val + v3_half) / v3); + *dst++ = (v3==0)? 
0 : saturate_cast<_Tp>((v2 * max_val + v3_half) / v3); + *dst++ = v3; + } + } +}; + + +template<> +struct mRGBA2RGBA +{ + typedef uchar channel_type; + + void operator()(const uchar* src, uchar* dst, int n) const + { + uchar max_val = ColorChannel::max(); + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); + v_uint8 vmax = vx_setall_u8(max_val); + + for( ; i <= n-vsize/4; + i += vsize/4, src += vsize, dst += vsize) + { + v_uint8 s = vx_load(src + 0*vsize); + + // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 => + // => 00,00,a0,a0,00,00,a1,a1 + // => a0,a0,a0,a0,a1,a1,a1,a1 + v_uint8 a; + v_uint16 a16; + v_uint32 a32; + a16 = v_reinterpret_as_u16(s & amask); + a32 = v_reinterpret_as_u32(a16 | (a16 >> 8)); + a = v_reinterpret_as_u8(a32 | (a32 >> 16)); + + // s *= max_val + v_uint16 s0, s1; + v_mul_expand(s, vmax, s0, s1); + + // s += a/2 + v_uint16 ae0, ae1; + v_expand(a, ae0, ae1); + s0 += ae0 >> 1; s1 += ae1 >> 1; + + // s, a -> u32 -> float + v_uint32 u00, u01, u10, u11; + v_int32 s00, s01, s10, s11; + v_expand(s0, u00, u01); + v_expand(s1, u10, u11); + s00 = v_reinterpret_as_s32(u00); + s01 = v_reinterpret_as_s32(u01); + s10 = v_reinterpret_as_s32(u10); + s11 = v_reinterpret_as_s32(u11); + + v_uint32 ua00, ua01, ua10, ua11; + v_int32 a00, a01, a10, a11; + v_expand(ae0, ua00, ua01); + v_expand(ae1, ua10, ua11); + a00 = v_reinterpret_as_s32(ua00); + a01 = v_reinterpret_as_s32(ua01); + a10 = v_reinterpret_as_s32(ua10); + a11 = v_reinterpret_as_s32(ua11); + + v_float32 fs00, fs01, fs10, fs11; + fs00 = v_cvt_f32(s00); + fs01 = v_cvt_f32(s01); + fs10 = v_cvt_f32(s10); + fs11 = v_cvt_f32(s11); + + v_float32 fa00, fa01, fa10, fa11; + fa00 = v_cvt_f32(a00); + fa01 = v_cvt_f32(a01); + fa10 = v_cvt_f32(a10); + fa11 = v_cvt_f32(a11); + + // float d = (float)s/(float)a + v_float32 fd00, fd01, fd10, fd11; + fd00 = fs00/fa00; + fd01 = fs01/fa01; + fd10 = fs10/fa10; + fd11 = fs11/fa11; + + // d 
-> u32 -> u8 + v_uint32 ud00, ud01, ud10, ud11; + ud00 = v_reinterpret_as_u32(v_trunc(fd00)); + ud01 = v_reinterpret_as_u32(v_trunc(fd01)); + ud10 = v_reinterpret_as_u32(v_trunc(fd10)); + ud11 = v_reinterpret_as_u32(v_trunc(fd11)); + v_uint16 ud0, ud1; + ud0 = v_pack(ud00, ud01); + ud1 = v_pack(ud10, ud11); + v_uint8 d; + d = v_pack(ud0, ud1); + + // if a == 0 then d = 0 + v_uint8 am; + am = a != vx_setzero_u8(); + d = d & am; + + // put alpha values + d = v_select(amask, a, d); + + v_store(dst, d); + } + + vx_cleanup(); +#endif + for(; i < n; i++, src += 4, dst += 4 ) + { + uchar v0 = src[0]; + uchar v1 = src[1]; + uchar v2 = src[2]; + uchar v3 = src[3]; + + uchar v3_half = v3 / 2; + + // rounded v*max_val/alpha, clamped by saturate_cast; alpha == 0 yields 0 + dst[0] = (v3==0)? 0 : saturate_cast((v0 * max_val + v3_half) / v3); + dst[1] = (v3==0)? 0 : saturate_cast((v1 * max_val + v3_half) / v3); + dst[2] = (v3==0)? 
0 : saturate_cast((v2 * max_val + v3_half) / v3); + dst[3] = v3; + } + } +}; + +// +// IPP functions +// + +#if NEED_IPP + +static ippiColor2GrayFunc ippiColor2GrayC3Tab[] = +{ + (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0, + 0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0 +}; + +static ippiColor2GrayFunc ippiColor2GrayC4Tab[] = +{ + (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0, + 0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0 +}; + +static ippiGeneralFunc ippiRGB2GrayC3Tab[] = +{ + (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0, + 0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0 +}; + +static ippiGeneralFunc ippiRGB2GrayC4Tab[] = +{ + (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0, + 0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0 +}; + + +#if !IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 +static IppStatus ippiGrayToRGB_C1C3R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize) +{ + return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); +} +#endif +static IppStatus ippiGrayToRGB_C1C3R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize) +{ + return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); +} +static IppStatus ippiGrayToRGB_C1C3R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize) +{ + return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); +} + +static IppStatus ippiGrayToRGB_C1C4R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize, Ipp8u aval) +{ + return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); +} +static IppStatus ippiGrayToRGB_C1C4R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, 
IppiSize roiSize, Ipp16u aval) +{ + return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); +} +static IppStatus ippiGrayToRGB_C1C4R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize, Ipp32f aval) +{ + return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); +} + +struct IPPColor2GrayFunctor +{ + IPPColor2GrayFunctor(ippiColor2GrayFunc _func) : + ippiColorToGray(_func) + { + coeffs[0] = B2YF; + coeffs[1] = G2YF; + coeffs[2] = R2YF; + } + bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const + { + return ippiColorToGray ? CV_INSTRUMENT_FUN_IPP(ippiColorToGray, src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false; + } +private: + ippiColor2GrayFunc ippiColorToGray; + Ipp32f coeffs[3]; +}; + +template +struct IPPGray2BGRFunctor +{ + IPPGray2BGRFunctor(){} + + bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const + { + return ippiGrayToRGB_C1C3R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows)) >= 0; + } +}; + +template +struct IPPGray2BGRAFunctor +{ + IPPGray2BGRAFunctor() + { + alpha = ColorChannel::max(); + } + + bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const + { + return ippiGrayToRGB_C1C4R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows), alpha) >= 0; + } + + T alpha; +}; + +static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, + IppiSize roiSize, const int *dstOrder) +{ + return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_8u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u); +} + +static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, + IppiSize roiSize, const int *dstOrder) +{ + return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_16u_C3C4R, pSrc, srcStep, pDst, 
dstStep, roiSize, dstOrder, MAX_IPP16u); +} + +static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, + IppiSize roiSize, const int *dstOrder) +{ + return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_32f_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f); +} + +// shared +ippiReorderFunc ippiSwapChannelsC3C4RTab[] = +{ + (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0, + 0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0 +}; + +static ippiGeneralFunc ippiCopyAC4C3RTab[] = +{ + (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0, + 0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0 +}; + +// shared +ippiReorderFunc ippiSwapChannelsC4C3RTab[] = +{ + (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0, + 0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0 +}; + +// shared +ippiReorderFunc ippiSwapChannelsC3RTab[] = +{ + (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0, + 0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0 +}; + +#if IPP_VERSION_X100 >= 810 +static ippiReorderFunc ippiSwapChannelsC4RTab[] = +{ + (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0, + 0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0 +}; +#endif + +#endif + +// +// HAL functions +// + +namespace hal +{ + +// 8u, 16u, 32f +void cvtBGRtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, int dcn, bool swapBlue) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtBGRtoBGR, cv_hal_cvtBGRtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue); + +#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 + CV_IPP_CHECK() + { + if(scn == 3 && dcn == 4 && !swapBlue) + { + if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, 
height, + IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) ) + return; + } + else if(scn == 4 && dcn == 3 && !swapBlue) + { + if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) ) + return; + } + else if(scn == 3 && dcn == 4 && swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) ) + return; + } + else if(scn == 4 && dcn == 3 && swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) ) + return; + } + else if(scn == 3 && dcn == 3 && swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height, + IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) ) + return; + } +#if IPP_VERSION_X100 >= 810 + else if(scn == 4 && dcn == 4 && swapBlue) + { + if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height, + IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) ) + return; + } + } +#endif +#endif + + int blueIdx = swapBlue ? 
2 : 0; + if( depth == CV_8U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); + else if( depth == CV_16U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); +} + +// only 8u +void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int greenBits) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits); + + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, swapBlue ? 2 : 0, greenBits)); +} + +// only 8u +void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int dcn, bool swapBlue, int greenBits) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits); + + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, swapBlue ? 
2 : 0, greenBits)); +} + +// 8u, 16u, 32f +void cvtBGRtoGray(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtBGRtoGray, cv_hal_cvtBGRtoGray, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue); + +#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 + CV_IPP_CHECK() + { + if(depth == CV_32F && scn == 3 && !swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) ) + return; + } + else if(depth == CV_32F && scn == 3 && swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) ) + return; + } + else if(depth == CV_32F && scn == 4 && !swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) ) + return; + } + else if(depth == CV_32F && scn == 4 && swapBlue) + { + if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) ) + return; + } + } +#endif + + int blueIdx = swapBlue ? 
2 : 0; + if( depth == CV_8U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); + else if( depth == CV_16U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); +} + +// 8u, 16u, 32f +void cvtGraytoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtGraytoBGR, cv_hal_cvtGraytoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn); + +#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 + CV_IPP_CHECK() + { + bool ippres = false; + if(dcn == 3) + { + if( depth == CV_8U ) + { +#if !IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 + ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); +#endif + } + else if( depth == CV_16U ) + ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); + else + ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); + } + else if(dcn == 4) + { + if( depth == CV_8U ) + ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); + else if( depth == CV_16U ) + ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); + else + ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); + } + if(ippres) + return; + } +#endif + + if( depth == CV_8U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); + else if( depth == CV_16U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); +} + +// only 8u 
+/* cvtBGR5x5toGray: HAL-level BGR5x5 -> gray conversion. Offers the call to cv_hal_cvtBGR5x5toGray through CALL_HAL and otherwise runs CvtColorLoop with RGB5x52Gray(greenBits). greenBits is forwarded unchanged (presumably 5 or 6 -- confirm with callers). */ void cvtBGR5x5toGray(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int greenBits) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits); + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits)); +} + +// only 8u +/* cvtGraytoBGR5x5: HAL-level gray -> BGR5x5 conversion; CALL_HAL with cv_hal_cvtGraytoBGR5x5 first, otherwise CvtColorLoop with Gray2RGB5x5(greenBits). */ void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int greenBits) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, greenBits); + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits)); +} + +/* cvtRGBAtoMultipliedRGBA: RGBA -> mRGBA conversion (the IPP path uses ippiAlphaPremul_8u_AC4R, i.e. alpha premultiplication). Order of attempts: CALL_HAL, then an IPP loop when HAVE_IPP is defined (returns on success), then CvtColorLoop with RGBA2mRGBA(). */ void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtRGBAtoMultipliedRGBA, cv_hal_cvtRGBAtoMultipliedRGBA, src_data, src_step, dst_data, dst_step, width, height); + +#ifdef HAVE_IPP + CV_IPP_CHECK() + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R))) + return; + } +#endif + + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA()); +} + +/* cvtMultipliedRGBAtoRGBA: the opposite direction; CALL_HAL first, then CvtColorLoop with mRGBA2RGBA(). No IPP path is present for this direction. */ void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height); + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA()); +} + +} // namespace hal + +// +// OCL calls +// + +#ifdef HAVE_OPENCL + +/* oclCvtColorBGR2BGR: OpenCL channel reorder / add / drop alpha. Builds kernel "RGB" from color_rgb_oclsrc with dcn and either REVERSE or ORDER defined (chosen by reverse); returns false when createKernel fails, otherwise the result of h.run(). NOTE(review): the third OclHelper argument reads "Set" with no argument list -- looks stripped by extraction; Set<3, 4> presumably lists accepted channel counts -- confirm. */ bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse ) +{ + OclHelper< Set<3, 4>, Set<3, 4>, Set > h(_src, _dst, dcn); + 
if(!h.createKernel("RGB", ocl::imgproc::color_rgb_oclsrc, + format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER"))) + { + return false; + } + + return h.run(); +} + +/* oclCvtColorBGR25x5: OpenCL 3/4-channel -> 2-channel 5x5 conversion. Kernel "RGB2RGB5x5" is built with dcn=2 plus the caller's bidx and greenbits; returns false if the kernel cannot be created, else h.run(). */ bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits ) +{ + OclHelper< Set<3, 4>, Set<2>, Set > h(_src, _dst, 2); + + if(!h.createKernel("RGB2RGB5x5", ocl::imgproc::color_rgb_oclsrc, + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, gbits))) + { + return false; + } + + return h.run(); +} + +/* oclCvtColor5x52BGR: OpenCL 2-channel 5x5 -> dcn-channel conversion via kernel "RGB5x52RGB"; dcn, bidx and gbits are passed as build options. Unlike the non-OCL cvtColor5x52BGR wrapper, no dcn <= 0 default is applied here. */ bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits) +{ + OclHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); + + if(!h.createKernel("RGB5x52RGB", ocl::imgproc::color_rgb_oclsrc, + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, gbits))) + { + return false; + } + + return h.run(); +} + +/* oclCvtColor5x52Gray: OpenCL 2-channel 5x5 -> 1-channel gray via kernel "BGR5x52Gray" (dcn=1, bidx=0 fixed, greenbits from gbits). */ bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits) +{ + OclHelper< Set<2>, Set<1>, Set > h(_src, _dst, 1); + + if(!h.createKernel("BGR5x52Gray", ocl::imgproc::color_rgb_oclsrc, + format("-D dcn=1 -D bidx=0 -D greenbits=%d", gbits))) + { + return false; + } + + return h.run(); +} + +/* oclCvtColorGray25x5: OpenCL 1-channel gray -> 2-channel 5x5 via kernel "Gray2BGR5x5" (dcn=2, bidx=0 fixed, greenbits from gbits). */ bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits) +{ + OclHelper< Set<1>, Set<2>, Set > h(_src, _dst, 2); + + if(!h.createKernel("Gray2BGR5x5", ocl::imgproc::color_rgb_oclsrc, + format("-D dcn=2 -D bidx=0 -D greenbits=%d", gbits))) + { + return false; + } + + return h.run(); +} + +/* oclCvtColorBGR2Gray: OpenCL 3/4-channel -> gray via kernel "RGB2Gray". stripeSize is fixed at 1 and passed as STRIPE_SIZE; globalSize[0] is then set to ceil(cols / stripeSize), which equals cols for the current value. */ bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx) +{ + OclHelper< Set<3, 4>, Set<1>, Set > h(_src, _dst, 1); + + int stripeSize = 1; + if(!h.createKernel("RGB2Gray", ocl::imgproc::color_rgb_oclsrc, + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", bidx, stripeSize))) + { + return false; + } + + h.globalSize[0] = (h.src.cols + stripeSize - 1)/stripeSize; + return h.run(); +} + +/* oclCvtColorGray2BGR: OpenCL gray -> dcn-channel conversion via kernel "Gray2RGB" (bidx=0 fixed). NOTE(review): in all OclHelper declarations on this line the third argument reads "Set" without an argument list -- looks stripped by extraction. */ bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn) +{ + OclHelper< Set<1>, Set<3, 4>, Set > h(_src, _dst, dcn); + 
if(!h.createKernel("Gray2RGB", ocl::imgproc::color_rgb_oclsrc, + format("-D bidx=0 -D dcn=%d", dcn))) + { + return false; + } + + return h.run(); +} + +/* oclCvtColorRGBA2mRGBA: OpenCL 4-channel -> 4-channel conversion via kernel "RGBA2mRGBA" with fixed options dcn=4, bidx=3; false if the kernel cannot be created, else h.run(). */ bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst) +{ + OclHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); + + if(!h.createKernel("RGBA2mRGBA", ocl::imgproc::color_rgb_oclsrc, + "-D dcn=4 -D bidx=3")) + { + return false; + } + + return h.run(); +} + +/* oclCvtColormRGBA2RGBA: the opposite direction, kernel "mRGBA2RGBA" with the same fixed build options. */ bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst) +{ + OclHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); + + if(!h.createKernel("mRGBA2RGBA", ocl::imgproc::color_rgb_oclsrc, + "-D dcn=4 -D bidx=3")) + { + return false; + } + + return h.run(); +} + +#endif + +// +// HAL calls +// + +/* cvtColorBGR2BGR: array-level wrapper; CvtHelper prepares h.src / h.dst for dcn output channels, then hal::cvtBGRtoBGR does the work with the helper's depth and source channel count. NOTE(review): CvtHelper is defined elsewhere; presumably it validates channel counts/depth and allocates _dst -- confirm. */ void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb) +{ + CvtHelper< Set<3, 4>, Set<3, 4>, Set > h(_src, _dst, dcn); + + hal::cvtBGRtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.depth, h.scn, dcn, swapb); +} + +/* cvtColorBGR25x5: wrapper around hal::cvtBGRtoBGR5x5 with a fixed 2-channel destination. */ void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits) +{ + CvtHelper< Set<3, 4>, Set<2>, Set > h(_src, _dst, 2); + + hal::cvtBGRtoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.scn, swapb, gbits); +} + +/* cvtColor5x52BGR: wrapper around hal::cvtBGR5x5toBGR; a non-positive dcn defaults to 3 before the helper is built. */ void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits) +{ + if(dcn <= 0) dcn = 3; + CvtHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); + + hal::cvtBGR5x5toBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + dcn, swapb, gbits); +} + +/* cvtColorBGR2Gray: wrapper around hal::cvtBGRtoGray with a fixed 1-channel destination. */ void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb) +{ + CvtHelper< Set<3, 4>, Set<1>, Set > h(_src, _dst, 1); + + hal::cvtBGRtoGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.depth, h.scn, swapb); +} + +/* cvtColorGray2BGR: wrapper around hal::cvtGraytoBGR; a non-positive dcn defaults to 3. */ void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn) +{ + if(dcn <= 0) dcn = 3; + CvtHelper< Set<1>, Set<3, 4>, Set > h(_src, _dst, dcn); + + hal::cvtGraytoBGR(h.src.data, h.src.step, 
h.dst.data, h.dst.step, h.src.cols, h.src.rows, h.depth, dcn); +} + +/* cvtColor5x52Gray: wrapper around hal::cvtBGR5x5toGray (2-channel source, 1-channel destination); gbits is forwarded as greenBits. */ void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits) +{ + CvtHelper< Set<2>, Set<1>, Set > h(_src, _dst, 1); + + hal::cvtBGR5x5toGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits); +} + +/* cvtColorGray25x5: wrapper around hal::cvtGraytoBGR5x5 (1-channel source, 2-channel destination). */ void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits) +{ + CvtHelper< Set<1>, Set<2>, Set > h(_src, _dst, 2); + + hal::cvtGraytoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits); +} + +/* cvtColorRGBA2mRGBA: wrapper around hal::cvtRGBAtoMultipliedRGBA; source and destination both have 4 channels. */ void cvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst) +{ + CvtHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); + + hal::cvtRGBAtoMultipliedRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows); +} + +/* cvtColormRGBA2RGBA: wrapper around hal::cvtMultipliedRGBAtoRGBA; source and destination both have 4 channels. */ void cvtColormRGBA2RGBA( InputArray _src, OutputArray _dst) +{ + CvtHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); + + hal::cvtMultipliedRGBAtoRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows); +} + +} // namespace cv diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp new file mode 100644 index 0000000000..7d731378e2 --- /dev/null +++ b/modules/imgproc/src/color_yuv.simd.hpp @@ -0,0 +1,2243 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "precomp.hpp" +#include "color.hpp" + +namespace cv +{ + +//constants for conversion from/to RGB and YUV, YCrCb according to BT.601 + +//to YCbCr +static const float YCBF = 0.564f; // == 1/2/(1-B2YF) +static const float YCRF = 0.713f; // == 1/2/(1-R2YF) +static const int YCBI = 9241; // == YCBF*16384 +static const int YCRI = 11682; // == YCRF*16384 +//to YUV +static const float B2UF = 0.492f; +static const float R2VF = 0.877f; +static const int B2UI = 8061; // == B2UF*16384 +static const int R2VI = 14369; // == R2VF*16384 +//from YUV +static const float U2BF = 2.032f; +static const float U2GF = -0.395f; +static const float V2GF = -0.581f; +static const float V2RF = 1.140f; +/* the integer forms below use the same 16384 (1 << 14) scale as above, rounded to nearest, e.g. U2BF*16384 ~= 33292 */ static const int U2BI = 33292; +static const int U2GI = -6472; +static const int V2GI = -9519; +static const int V2RI = 18678; +//from YCrCb +static const float CB2BF = 1.773f; +static const float CB2GF = -0.344f; +static const float CR2GF = -0.714f; +static const float CR2RF = 1.403f; +static const int CB2BI = 29049; +static const int CB2GI = -5636; +static const int CR2GI = -11698; +static const int CR2RI = 22987; + +///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// + +/* RGB2YCrCb_f (generic): floating-point-coefficient RGB/BGR -> YCrCb or YUV row functor. The constructor copies one of two 5-entry tables ({R2Y, G2Y, B2Y, Cr/V scale, Cb/U scale}) chosen by isCrCb and swaps coeffs[0] and coeffs[2] when blueIdx == 0, so coeffs[0..2] line up with the source channel order. NOTE(review): the template parameter list is missing in this text ("template struct"); _Tp is presumably the type parameter -- confirm. */ template struct RGB2YCrCb_f +{ + typedef _Tp channel_type; + + RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : + srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; + static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; + } + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + } + + /* Converts n pixels: reads scn channels per source pixel and writes 3 per destination pixel (n is rescaled to a destination element count). Y is the weighted channel sum; Cr uses the channel at bidx^2 (red) and Cb the channel at bidx (blue), each offset by ColorChannel half(). yuvOrder swaps the two chroma output slots so YUV output is stored as Y, U(Cb), V(Cr). */ void operator()(const _Tp* src, _Tp* dst, int n) const + { + int scn = srccn, bidx = blueIdx; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + const _Tp delta = ColorChannel<_Tp>::half(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + n *= 3; + for(int i = 0; i < n; i += 3, src += scn) + { + _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2); + _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta); + _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta); + dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb; + } + } + int srccn, blueIdx; + bool isCrCb; + float coeffs[5]; +}; + +/* Explicit specialization of RGB2YCrCb_f whose channel_type is float; it adds a CV_SIMD path in front of the scalar loop. NOTE(review): the specialization argument after the struct name is missing in this text (presumably float) -- confirm. */ template <> +struct RGB2YCrCb_f +{ + typedef float channel_type; + + RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : + srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; + static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; + } + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + } + + /* Float row conversion. With CV_SIMD, processes v_float32::nlanes pixels per iteration: deinterleave 3 or 4 channels, compute y with fused multiply-adds, then chroma as (channel - y)*C + delta; the scalar loop afterwards handles the remaining pixels (or all of them without CV_SIMD). No saturation is applied in this float version. */ void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, bidx = blueIdx; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + const float delta = ColorChannel::half(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + + int i = 0; +#if CV_SIMD + v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); + v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4); + v_float32 vdelta = vx_setall_f32(delta); + const int vsize = v_float32::nlanes; + for( ; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize*3) + { + v_float32 b, g, r, dummy; + if(scn == 3) + { + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, dummy); + } + + v_float32 y, cr, cb; + /* b, g, r here are simply channels 0, 1, 2; coeffs[0..2] were already ordered to match in the constructor */ y = v_fma(b, vc0, v_fma(g, vc1, r*vc2)); + + /* when bidx != 0 channel 0 is red, so swap to make r/b hold the real red/blue before the chroma terms */ if(bidx) + std::swap(r, b); + + cr = v_fma(r - y, vc3, vdelta); + cb = v_fma(b - y, vc4, vdelta); + + if(yuvOrder) + { + v_store_interleave(dst, y, cb, cr); + } + else + { + v_store_interleave(dst, y, cr, cb); + } + } + vx_cleanup(); +#endif + for ( ; i < n; i ++, src += scn, dst += 3) + { + float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; + float Cr = (src[bidx^2] - Y)*C3 + delta; + float Cb = (src[bidx] - Y)*C4 + delta; + dst[0 ] = Y; + dst[1+yuvOrder] = Cr; + dst[2-yuvOrder] = Cb; + } + } + + int srccn, blueIdx; + bool isCrCb; + float coeffs[5]; +}; + + +/* RGB2YCrCb_i (generic): fixed-point RGB/BGR -> YCrCb or YUV row functor using integer coefficients scaled by 1 << shift (shift == yuv_shift). Coefficient selection and the blueIdx == 0 swap mirror RGB2YCrCb_f. NOTE(review): the template parameter list is missing in this text -- confirm. */ template struct RGB2YCrCb_i +{ + typedef _Tp channel_type; + static const int shift = yuv_shift; + + RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) + : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; + static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; + + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; + } + if(blueIdx==0) std::swap(coeffs[0], coeffs[2]); + } + /* Scalar fixed-point conversion of n pixels. delta is half() pre-scaled by 1 << shift so it can be added before CV_DESCALE, which rounds and shifts back; results are saturated to _Tp on store. yuvOrder swaps the two chroma output slots. */ void operator()(const _Tp* src, _Tp* dst, int n) const + { + int scn = srccn, bidx = blueIdx; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel<_Tp>::half()*(1 << shift); + n *= 3; + for(int i = 0; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift); + dst[i] = saturate_cast<_Tp>(Y); + dst[i+1+yuvOrder] = saturate_cast<_Tp>(Cr); + dst[i+2-yuvOrder] = saturate_cast<_Tp>(Cb); + } + } + int srccn, blueIdx; + bool isCrCb; + int coeffs[5]; +}; + + +/* Explicit specialization of RGB2YCrCb_i whose channel_type is ushort; adds a CV_SIMD path. fix_shift == 16 - shift is used to correct products computed on 16-bit lanes reinterpreted as signed. NOTE(review): the specialization argument after the struct name is missing in this text (presumably ushort) -- confirm. */ template<> +struct RGB2YCrCb_i +{ + typedef ushort channel_type; + static const int shift = yuv_shift; + static const int fix_shift = (int)(sizeof(short)*8 - shift); + + RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) + : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; + static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; + + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; + } + if(blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + } + + /* 16-bit row conversion. The SIMD loop handles v_uint16::nlanes pixels per iteration; luma is computed with 16x16 -> 32 bit dot products on interleaved (b,g) and (r, rounding constant) pairs, chroma in 32-bit lanes. A scalar loop finishes the remaining pixels. */ void operator()(const ushort* src, ushort* dst, int n) const + { + int scn = srccn, bidx = blueIdx; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int sdelta = ColorChannel::half()*(1 << shift); + int i = 0; +#if CV_SIMD + const int vsize = v_uint16::nlanes; + const int descale = 1 << (shift-1); + + v_int16 b2y = vx_setall_s16((short)C0); + v_int16 g2y = vx_setall_s16((short)C1); + v_int16 r2y = vx_setall_s16((short)C2); + v_int16 one = vx_setall_s16(1); + v_int16 z = vx_setzero_s16(); + + /* interleaved coefficient pairs: (C0, C1) for the b/g dot product and (C2, 1) so that the rounding constant zipped with r is added with weight 1 */ v_int16 bg2y, r12y; + v_int16 dummy; + v_zip(b2y, g2y, bg2y, dummy); + v_zip(r2y, one, r12y, dummy); + + v_int16 vdescale = vx_setall_s16(1 << (shift-1)); + v_int32 vc3 = vx_setall_s32(C3); + v_int32 vc4 = vx_setall_s32(C4); + v_int32 vdd = vx_setall_s32(sdelta + descale); + + for(; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize*3) + { + v_uint16 r, g, b, a; + if(scn == 3) + { + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, a); + } + + v_uint16 y, cr, cb; + + v_int16 sb = v_reinterpret_as_s16(b); + v_int16 sr = v_reinterpret_as_s16(r); + v_int16 sg = v_reinterpret_as_s16(g); + + v_int16 bg0, bg1; + v_int16 rd0, rd1; + v_zip(sb, sg, bg0, bg1); + v_zip(sr, vdescale, rd0, rd1); + + // fixing 16bit signed multiplication + /* a lane whose unsigned value is >= 0x8000 reads as negative after the s16 reinterpret, i.e. 65536 too small; (65536*coef) >> shift equals coef << fix_shift, which is accumulated here for the affected lanes and added back (wrapping) after the pack */ v_int16 mr, mg, mb; + mr = (sr < z) & r2y; + mg = (sg < z) & g2y; + mb = (sb < z) & b2y; + v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; + + v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; + v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; + + y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul)); + + if(bidx) + swap(r, b); + + // (r-Y) and (b-Y) don't fit into int16 or uint16 range + v_uint32 r0, r1, b0, b1; + v_expand(r, r0, r1); + v_expand(b, b0, b1); + 
v_uint32 uy0, uy1; + v_expand(y, uy0, uy1); + + v_int32 sr0 = v_reinterpret_as_s32(r0); + v_int32 sr1 = v_reinterpret_as_s32(r1); + v_int32 sb0 = v_reinterpret_as_s32(b0); + v_int32 sb1 = v_reinterpret_as_s32(b1); + v_int32 sy0 = v_reinterpret_as_s32(uy0); + v_int32 sy1 = v_reinterpret_as_s32(uy1); + + sr0 = sr0 - sy0; sr1 = sr1 - sy1; + sb0 = sb0 - sy0; sb1 = sb1 - sy1; + + v_int32 scr0, scr1, scb0, scb1; + + /* vdd already contains sdelta plus the rounding term, so one add and one shift reproduce CV_DESCALE(x*C + sdelta, shift) */ scr0 = (sr0*vc3 + vdd) >> shift; + scr1 = (sr1*vc3 + vdd) >> shift; + scb0 = (sb0*vc4 + vdd) >> shift; + scb1 = (sb1*vc4 + vdd) >> shift; + + // saturate and pack + cr = v_pack_u(scr0, scr1); + cb = v_pack_u(scb0, scb1); + + if(yuvOrder) + { + v_store_interleave(dst, y, cb, cr); + } + else + { + v_store_interleave(dst, y, cr, cb); + } + } + vx_cleanup(); +#endif + /* scalar tail: remaining pixels after the SIMD loop, or the whole row when CV_SIMD is off. NOTE(review): saturate_cast appears without its type argument in this text -- presumably stripped. */ for( ; i < n; i++, src += scn, dst += 3) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + sdelta, shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + sdelta, shift); + dst[0] = saturate_cast(Y); + dst[1+yuvOrder] = saturate_cast(Cr); + dst[2-yuvOrder] = saturate_cast(Cb); + } + } + int srccn, blueIdx; + bool isCrCb; + int coeffs[5]; +}; + + +/* Explicit specialization of RGB2YCrCb_i whose channel_type is uchar; adds a CV_SIMD path working on 16-bit lanes. NOTE(review): the specialization argument after the struct name is missing in this text (presumably uchar) -- confirm. */ template <> +struct RGB2YCrCb_i +{ + typedef uchar channel_type; + static const int shift = yuv_shift; + + RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) + : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; + static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; + } + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + } + + /* 8-bit row conversion. The SIMD loop handles v_uint8::nlanes pixels per iteration: channels are widened to 16 bits, luma and chroma are computed with v_dotprod on interleaved (value, rounding constant) pairs, then narrowed back to 8 bits. A scalar loop finishes the remaining pixels. */ void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << shift); + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + const int descaleShift = 1 << (shift-1); + v_int16 bg2y; + v_int16 r12y; + v_int16 dummy; + v_zip(vx_setall_s16((short)C0), vx_setall_s16((short)C1), bg2y, dummy); + v_zip(vx_setall_s16((short)C2), vx_setall_s16( 1), r12y, dummy); + + // delta + descaleShift == descaleShift*(half*2+1) + /* so pairing each chroma difference with the lane constant descaleShift and each coefficient with h21 = half*2+1 lets one dot product add both delta and the rounding term */ v_int16 c3h, c4h; + const short h21 = (short)(ColorChannel::half()*2+1); + v_zip(vx_setall_s16((short)C3), vx_setall_s16(h21), c3h, dummy); + v_zip(vx_setall_s16((short)C4), vx_setall_s16(h21), c4h, dummy); + + v_int16 vdescale = vx_setall_s16(descaleShift); + + for( ; i <= n-vsize; + i += vsize, src += scn*vsize, dst += 3*vsize) + { + v_uint8 r, g, b, a; + if(scn == 3) + { + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, a); + } + + v_uint8 y; + + v_uint16 r0, r1, g0, g1, b0, b1; + v_expand(r, r0, r1); + v_expand(g, g0, g1); + v_expand(b, b0, b1); + + v_int16 sr0, sr1, sg0, sg1, sb0, sb1; + sr0 = v_reinterpret_as_s16(r0); sr1 = v_reinterpret_as_s16(r1); + sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1); + sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1); + + v_uint32 y00, y01, y10, y11; + { + v_int16 bg00, bg01, bg10, bg11; + v_int16 rd00, rd01, rd10, rd11; + v_zip(sb0, sg0, bg00, bg01); + v_zip(sb1, sg1, bg10, bg11); + v_zip(sr0, vdescale, rd00, rd01); + v_zip(sr1, vdescale, rd10, rd11); + + y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; + y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; + y10 = 
v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; + y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; + } + + /* narrow the four 32-bit luma vectors to two 16-bit vectors (kept for the chroma differences) and then to the final 8-bit y */ v_uint16 y0, y1; + y0 = v_pack(y00, y01); + y1 = v_pack(y10, y11); + + y = v_pack(y0, y1); + + v_int16 sy0, sy1; + sy0 = v_reinterpret_as_s16(y0); + sy1 = v_reinterpret_as_s16(y1); + + // (r-Y) and (b-Y) don't fit into 8 bit, use 16 bits instead + sr0 = v_sub_wrap(sr0, sy0); + sr1 = v_sub_wrap(sr1, sy1); + sb0 = v_sub_wrap(sb0, sy0); + sb1 = v_sub_wrap(sb1, sy1); + + /* sr*/sb* were loaded from channels 2 and 0; with bidx != 0 channel 0 is red, so exchange them before the Cr/Cb products */ if(bidx) + { + swap(sr0, sb0); swap(sr1, sb1); + } + + v_int32 cr00, cr01, cr10, cr11; + v_int32 cb00, cb01, cb10, cb11; + + // delta + descaleShift == descaleShift*(half*2+1) + { + v_int16 rd00, rd01, rd10, rd11; + v_int16 bd00, bd01, bd10, bd11; + + v_zip(sr0, vdescale, rd00, rd01); + v_zip(sr1, vdescale, rd10, rd11); + + v_zip(sb0, vdescale, bd00, bd01); + v_zip(sb1, vdescale, bd10, bd11); + + cr00 = v_dotprod(rd00, c3h); + cr01 = v_dotprod(rd01, c3h); + cr10 = v_dotprod(rd10, c3h); + cr11 = v_dotprod(rd11, c3h); + + cb00 = v_dotprod(bd00, c4h); + cb01 = v_dotprod(bd01, c4h); + cb10 = v_dotprod(bd10, c4h); + cb11 = v_dotprod(bd11, c4h); + } + + v_uint8 cr, cb; + + cr00 = cr00 >> shift; + cr01 = cr01 >> shift; + cr10 = cr10 >> shift; + cr11 = cr11 >> shift; + + cb00 = cb00 >> shift; + cb01 = cb01 >> shift; + cb10 = cb10 >> shift; + cb11 = cb11 >> shift; + + v_int16 cr0, cr1, cb0, cb1; + cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11); + cb0 = v_pack(cb00, cb01); cb1 = v_pack(cb10, cb11); + + cr = v_pack_u(cr0, cr1); + cb = v_pack_u(cb0, cb1); + + if(yuvOrder) + { + v_store_interleave(dst, y, cb, cr); + } + else + { + v_store_interleave(dst, y, cr, cb); + } + } + vx_cleanup(); +#endif + + /* scalar tail: remaining pixels after the SIMD loop, or the whole row when CV_SIMD is off */ for ( ; i < n; i++, src += scn, dst += 3) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift); + dst[0] = 
saturate_cast(Y); + dst[1+yuvOrder] = saturate_cast(Cr); + dst[2-yuvOrder] = saturate_cast(Cb); + } + } + + int srccn, blueIdx, coeffs[5]; + bool isCrCb; +}; + + +/* YCrCb2RGB_f (generic): floating-point-coefficient YCrCb or YUV -> RGB/BGR row functor. coeffs holds {Cr->R, Cr->G, Cb->G, Cb->B} (or the V/U equivalents when isCrCb is false). operator() reads 3 source channels per pixel, writes dcn channels, placing blue at dst[bidx] and red at dst[bidx^2], and fills dst[3] with ColorChannel max() when dcn == 4. NOTE(review): the template parameter list is missing in this text -- confirm. */ template struct YCrCb2RGB_f +{ + typedef _Tp channel_type; + + YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb) + : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; + static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? coeffs_cbr[i] : coeffs_yuv[i]; + } + } + void operator()(const _Tp* src, _Tp* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + for(int i = 0; i < n; i += 3, dst += dcn) + { + _Tp Y = src[i]; + _Tp Cr = src[i+1+yuvOrder]; + _Tp Cb = src[i+2-yuvOrder]; + + _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3); + _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1); + _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0); + + dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + bool isCrCb; + float coeffs[4]; +}; + + +/* Explicit specialization of YCrCb2RGB_f whose channel_type is float; adds a CV_SIMD path. NOTE(review): the specialization argument after the struct name is missing in this text (presumably float) -- confirm. */ template<> +struct YCrCb2RGB_f +{ + typedef float channel_type; + + YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb) + : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; + static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? 
coeffs_cbr[i] : coeffs_yuv[i]; + } + } + + /* Float row conversion. With CV_SIMD, v_float32::nlanes pixels per iteration: deinterleave y/cr/cb in the order given by yuvOrder, subtract delta from both chroma vectors, build b/g/r with fused multiply-adds, and store 3 or 4 interleaved channels (alpha lanes set to ColorChannel max()). The scalar loop handles the rest. No saturation in this float version. */ void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + const float delta = ColorChannel::half(), alpha = ColorChannel::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + + int i = 0; +#if CV_SIMD + v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1); + v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3); + v_float32 vdelta = vx_setall_f32(delta); + v_float32 valpha = vx_setall_f32(alpha); + const int vsize = v_float32::nlanes; + for( ; i <= n-vsize; + i += vsize, src += vsize*3, dst += vsize*dcn) + { + v_float32 y, cr, cb; + if(yuvOrder) + v_load_deinterleave(src, y, cb, cr); + else + v_load_deinterleave(src, y, cr, cb); + + v_float32 b, g, r; + + cb -= vdelta; cr -= vdelta; + b = v_fma(cb, vc3, y); + g = v_fma(cr, vc1, v_fma(cb, vc2, y)); + r = v_fma(cr, vc0, y); + + /* the stores below write (b, g, r) to channels 0, 1, 2; swapping first puts blue at channel 2 when bidx != 0 */ if(bidx) + swap(r, b); + + if(dcn == 3) + v_store_interleave(dst, b, g, r); + else + v_store_interleave(dst, b, g, r, valpha); + } + vx_cleanup(); +#endif + for(; i < n; i++, src += 3, dst += dcn) + { + float Y = src[0]; + float Cr = src[1+yuvOrder]; + float Cb = src[2-yuvOrder]; + + float b = Y + (Cb - delta)*C3; + float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; + float r = Y + (Cr - delta)*C0; + + dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + bool isCrCb; + float coeffs[4]; +}; + + +/* YCrCb2RGB_i (generic): fixed-point YCrCb or YUV -> RGB/BGR row functor; integer coefficients are scaled by 1 << shift (shift == yuv_shift). NOTE(review): the template parameter list is missing in this text -- confirm. */ template struct YCrCb2RGB_i +{ + typedef _Tp channel_type; + static const int shift = yuv_shift; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) + : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; + static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; + } + } + + /* Scalar fixed-point conversion of n pixels: each chroma term is multiplied by its integer coefficient, rounded and shifted back by CV_DESCALE, added to Y, and saturated to _Tp on store. Blue goes to dst[bidx], red to dst[bidx^2]; dst[3] receives ColorChannel max() when dcn == 4. */ void operator()(const _Tp* src, _Tp* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + for(int i = 0; i < n; i += 3, dst += dcn) + { + _Tp Y = src[i]; + _Tp Cr = src[i+1+yuvOrder]; + _Tp Cb = src[i+2-yuvOrder]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, shift); + + dst[bidx] = saturate_cast<_Tp>(b); + dst[1] = saturate_cast<_Tp>(g); + dst[bidx^2] = saturate_cast<_Tp>(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + bool isCrCb; + int coeffs[4]; +}; + + +/* Explicit specialization of YCrCb2RGB_i whose channel_type is uchar; adds a CV_SIMD path. NOTE(review): the specialization argument after the struct name is missing in this text (presumably uchar) -- confirm. */ template <> +struct YCrCb2RGB_i +{ + typedef uchar channel_type; + static const int shift = yuv_shift; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) + : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; + static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; + } + } + + /* 8-bit row conversion. The SIMD loop handles v_uint8::nlanes pixels per iteration: chroma minus delta is reinterpreted as signed 8-bit, widened to 16 bits, multiplied into 32-bit lanes with v_mul_expand, rounded and shifted, then added to the widened luma and packed with unsigned saturation. A scalar loop finishes the remaining pixels. */ void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 valpha = vx_setall_u8(alpha); + v_uint8 vdelta = vx_setall_u8(delta); + const int descaleShift = 1 << (shift - 1); + v_int32 vdescale = vx_setall_s32(descaleShift); + + v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); + // if YUV then C3 > 2^15, need to subtract it + // to fit in short by short multiplication + /* in the YUV table C3 is U2BI (33292), which exceeds the 16-bit signed range; the subtracted (1 << 15) part is re-added below as cb << 15 */ v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3); + + for( ; i <= n-vsize; + i += vsize, src += 3*vsize, dst += dcn*vsize) + { + v_uint8 y, cr, cb; + if(yuvOrder) + { + v_load_deinterleave(src, y, cb, cr); + } + else + { + v_load_deinterleave(src, y, cr, cb); + } + + cr = v_sub_wrap(cr, vdelta); + cb = v_sub_wrap(cb, vdelta); + + v_int8 scr = v_reinterpret_as_s8(cr); + v_int8 scb = v_reinterpret_as_s8(cb); + + v_int16 scr0, scr1, scb0, scb1; + v_expand(scr, scr0, scr1); + v_expand(scb, scb0, scb1); + + v_int32 b00, b01, b10, b11; + v_int32 g00, g01, g10, g11; + v_int32 r00, r01, r10, r11; + + v_mul_expand(scb0, vc3, b00, b01); + v_mul_expand(scb1, vc3, b10, b11); + if(yuvOrder) + { + // if YUV then C3 > 2^15 + // so we fix the multiplication + v_int32 cb00, cb01, cb10, cb11; + v_expand(scb0, cb00, cb01); + v_expand(scb1, cb10, cb11); + b00 += cb00 << 15; b01 += cb01 << 15; + b10 += cb10 << 15; b11 += cb11 << 15; + } + + v_int32 t00, t01, t10, t11; + v_mul_expand(scb0, vc2, t00, t01); + v_mul_expand(scb1, vc2, t10, t11); + v_mul_expand(scr0, vc1, g00, g01); + v_mul_expand(scr1, vc1, g10, g11); + g00 += t00; g01 += t01; + g10 += t10; g11 += t11; + v_mul_expand(scr0, vc0, r00, r01); + 
v_mul_expand(scr1, vc0, r10, r11); + + /* rounding descale of every 32-bit chroma contribution: (x + (1 << (shift-1))) >> shift */ b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift; + b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift; + g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift; + g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift; + r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift; + r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift; + + v_int16 b0, b1, g0, g1, r0, r1; + b0 = v_pack(b00, b01); b1 = v_pack(b10, b11); + g0 = v_pack(g00, g01); g1 = v_pack(g10, g11); + r0 = v_pack(r00, r01); r1 = v_pack(r10, r11); + + /* add the luma in 16-bit lanes, then pack to 8 bits with unsigned saturation */ v_uint16 y0, y1; + v_expand(y, y0, y1); + v_int16 sy0, sy1; + sy0 = v_reinterpret_as_s16(y0); + sy1 = v_reinterpret_as_s16(y1); + + b0 = v_add_wrap(b0, sy0); b1 = v_add_wrap(b1, sy1); + g0 = v_add_wrap(g0, sy0); g1 = v_add_wrap(g1, sy1); + r0 = v_add_wrap(r0, sy0); r1 = v_add_wrap(r1, sy1); + + v_uint8 b, g, r; + b = v_pack_u(b0, b1); + g = v_pack_u(g0, g1); + r = v_pack_u(r0, r1); + + if(bidx) + swap(r, b); + + if(dcn == 3) + { + v_store_interleave(dst, b, g, r); + } + else + { + v_store_interleave(dst, b, g, r, valpha); + } + } + vx_cleanup(); +#endif + + /* scalar tail: remaining pixels after the SIMD loop, or the whole row when CV_SIMD is off */ for ( ; i < n; i++, src += 3, dst += dcn) + { + uchar Y = src[0]; + uchar Cr = src[1+yuvOrder]; + uchar Cb = src[2-yuvOrder]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + bool isCrCb; + int coeffs[4]; +}; + + +/* Explicit specialization of YCrCb2RGB_i whose channel_type is ushort; adds a CV_SIMD path that keeps the luma addition in 32-bit lanes. NOTE(review): the specialization argument after the struct name is missing in this text (presumably ushort) -- confirm. */ template <> +struct YCrCb2RGB_i +{ + typedef ushort channel_type; + static const int shift = yuv_shift; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) + : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) + { + static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; + static const int 
coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } + } + + /* 16-bit row conversion. The SIMD loop handles v_uint16::nlanes pixels per iteration: chroma minus delta is reinterpreted as signed 16-bit and multiplied into 32-bit lanes with v_mul_expand; the descaled result is added to the 32-bit luma and packed with unsigned saturation. A scalar loop finishes the remaining pixels. */ void operator()(const ushort* src, ushort* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb + const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + +#if CV_SIMD + const int vsize = v_uint16::nlanes; + const int descaleShift = 1 << (shift-1); + v_uint16 valpha = vx_setall_u16(alpha); + v_uint16 vdelta = vx_setall_u16(delta); + v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); + // if YUV then C3 > 2^15, need to subtract it + // to fit in short by short multiplication + /* in the YUV table C3 is U2BI (33292), which exceeds the 16-bit signed range; the subtracted (1 << 15) part is re-added below as cb << 15 */ v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3); + v_int32 vdescale = vx_setall_s32(descaleShift); + for(; i <= n-vsize; + i += vsize, src += vsize*3, dst += vsize*dcn) + { + v_uint16 y, cr, cb; + if(yuvOrder) + { + v_load_deinterleave(src, y, cb, cr); + } + else + { + v_load_deinterleave(src, y, cr, cb); + } + + v_uint32 uy0, uy1; + v_expand(y, uy0, uy1); + v_int32 y0 = v_reinterpret_as_s32(uy0); + v_int32 y1 = v_reinterpret_as_s32(uy1); + + cr = v_sub_wrap(cr, vdelta); + cb = v_sub_wrap(cb, vdelta); + + v_int32 b0, b1, g0, g1, r0, r1; + + v_int16 scb = v_reinterpret_as_s16(cb); + v_int16 scr = v_reinterpret_as_s16(cr); + v_mul_expand(scb, vc3, b0, b1); + if(yuvOrder) + { + // if YUV then C3 > 2^15 + // so we fix the multiplication + v_int32 cb0, cb1; + v_expand(scb, cb0, cb1); + b0 += cb0 << 15; + b1 += cb1 << 15; + } + v_int32 t0, t1; + v_mul_expand(scb, vc2, t0, t1); + v_mul_expand(scr, vc1, g0, g1); + g0 += t0; g1 += t1; + v_mul_expand(scr, vc0, r0, r1); + + // shifted term doesn't fit into 16 bits, addition is to be done in 32 bits + b0 = ((b0 + vdescale) >> shift) + y0; + b1 = ((b1 + vdescale) >> shift) + y1; + g0 = ((g0 + 
vdescale) >> shift) + y0; + g1 = ((g1 + vdescale) >> shift) + y1; + r0 = ((r0 + vdescale) >> shift) + y0; + r1 = ((r1 + vdescale) >> shift) + y1; + + // saturate and pack + v_uint16 b, g, r; + b = v_pack_u(b0, b1); + g = v_pack_u(g0, g1); + r = v_pack_u(r0, r1); + + if(bidx) + swap(r, b); + + if(dcn == 3) + { + v_store_interleave(dst, b, g, r); + } + else + { + v_store_interleave(dst, b, g, r, valpha); + } + } + vx_cleanup(); +#endif + + /* scalar tail: remaining pixels after the SIMD loop, or the whole row when CV_SIMD is off */ for ( ; i < n; i++, src += 3, dst += dcn) + { + ushort Y = src[0]; + ushort Cr = src[1+yuvOrder]; + ushort Cb = src[2-yuvOrder]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + bool isCrCb; + int coeffs[4]; +}; + + +///////////////////////////////////// YUV420 -> RGB ///////////////////////////////////// + +/* Fixed-point coefficients for the formulas commented below, scaled by 1 << ITUR_BT_601_SHIFT (2^20); e.g. 1.164 * 2^20 ~= 1220542 (CY) and 1.596 * 2^20 ~= 1673527 (CVR). */ static const int ITUR_BT_601_CY = 1220542; +static const int ITUR_BT_601_CUB = 2116026; +static const int ITUR_BT_601_CUG = -409993; +static const int ITUR_BT_601_CVG = -852492; +static const int ITUR_BT_601_CVR = 1673527; +static const int ITUR_BT_601_SHIFT = 20; + +// Coefficients for RGB to YUV420p conversion +static const int ITUR_BT_601_CRY = 269484; +static const int ITUR_BT_601_CGY = 528482; +static const int ITUR_BT_601_CBY = 102760; +static const int ITUR_BT_601_CRU = -155188; +static const int ITUR_BT_601_CGU = -305135; +static const int ITUR_BT_601_CBU = 460324; +static const int ITUR_BT_601_CGV = -385875; +static const int ITUR_BT_601_CBV = -74448; + +//R = 1.164(Y - 16) + 1.596(V - 128) +//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) +//B = 1.164(Y - 16) + 2.018(U - 128) + +//R = (1220542(Y - 16) + 1673527(V - 128) + (1 << 19)) >> 20 +//G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20 +//B = (1220542(Y 
- 16) + 2116026(U - 128) + (1 << 19)) >> 20 + +static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) +{ + int uu, vv; + uu = int(u) - 128; + vv = int(v) - 128; + + ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * vv; + guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu; + buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; +} + +static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, + v_int32 (&ruv)[4], + v_int32 (&guv)[4], + v_int32 (&buv)[4]) +{ + v_uint8 v128 = vx_setall_u8(128); + v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128)); + v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128)); + + v_int16 uu0, uu1, vv0, vv1; + v_expand(su, uu0, uu1); + v_expand(sv, vv0, vv1); + v_int32 uu[4], vv[4]; + v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]); + v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]); + + v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1)); + v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR); + v_int32 vg = vx_setall_s32(ITUR_BT_601_CVG); + v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG); + v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB); + + for (int k = 0; k < 4; k++) + { + ruv[k] = vshift + vr * vv[k]; + guv[k] = vshift + vg * vv[k] + ug * uu[k]; + buv[k] = vshift + ub * uu[k]; + } +} + +static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv, + uchar& r, uchar& g, uchar& b, uchar& a) +{ + int yy = int(vy); + int y = std::max(0, yy - 16) * ITUR_BT_601_CY; + r = saturate_cast((y + ruv) >> ITUR_BT_601_SHIFT); + g = saturate_cast((y + guv) >> ITUR_BT_601_SHIFT); + b = saturate_cast((y + buv) >> ITUR_BT_601_SHIFT); + a = uchar(0xff); +} + +static inline void yRGBuvToRGBA(const v_uint8& vy, + const v_int32 (&ruv)[4], + const v_int32 (&guv)[4], + const v_int32 (&buv)[4], + v_uint8& rr, v_uint8& gg, v_uint8& bb) +{ + v_uint8 v16 = vx_setall_u8(16); + v_uint8 posY = vy - v16; + v_uint16 yy0, yy1; + v_expand(posY, 
yy0, yy1); + v_int32 yy[4]; + v_int32 yy00, yy01, yy10, yy11; + v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]); + v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]); + + v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY); + + v_int32 y[4], r[4], g[4], b[4]; + for(int k = 0; k < 4; k++) + { + y[k] = yy[k]*vcy; + r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT; + g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT; + b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT; + } + + v_int16 r0, r1, g0, g1, b0, b1; + r0 = v_pack(r[0], r[1]); + r1 = v_pack(r[2], r[3]); + g0 = v_pack(g[0], g[1]); + g1 = v_pack(g[2], g[3]); + b0 = v_pack(b[0], b[1]); + b1 = v_pack(b[2], b[3]); + + rr = v_pack_u(r0, r1); + gg = v_pack_u(g0, g1); + bb = v_pack_u(b0, b1); +} + +template +static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v, + const uchar vy01, const uchar vy11, const uchar vy02, const uchar vy12, + uchar* row1, uchar* row2) +{ + int ruv, guv, buv; + uvToRGBuv(u, v, ruv, guv, buv); + + uchar r00, g00, b00, a00; + uchar r01, g01, b01, a01; + + yRGBuvToRGBA(vy01, ruv, guv, buv, r00, g00, b00, a00); + yRGBuvToRGBA(vy11, ruv, guv, buv, r01, g01, b01, a01); + + row1[2-bIdx] = r00; + row1[1] = g00; + row1[bIdx] = b00; + if(dcn == 4) + row1[3] = a00; + + row1[dcn+2-bIdx] = r01; + row1[dcn+1] = g01; + row1[dcn+0+bIdx] = b01; + if(dcn == 4) + row1[7] = a01; + + if(is420) + { + uchar r10, g10, b10, a10; + uchar r11, g11, b11, a11; + + yRGBuvToRGBA(vy02, ruv, guv, buv, r10, g10, b10, a10); + yRGBuvToRGBA(vy12, ruv, guv, buv, r11, g11, b11, a11); + + row2[2-bIdx] = r10; + row2[1] = g10; + row2[bIdx] = b10; + if(dcn == 4) + row2[3] = a10; + + row2[dcn+2-bIdx] = r11; + row2[dcn+1] = g11; + row2[dcn+0+bIdx] = b11; + if(dcn == 4) + row2[7] = a11; + } +} + +// bIdx is 0 or 2, uIdx is 0 or 1, dcn is 3 or 4 +template +struct YUV420sp2RGB8Invoker : ParallelLoopBody +{ + uchar * dst_data; + size_t dst_step; + int width; + const uchar* my1, *muv; + size_t stride; + + YUV420sp2RGB8Invoker(uchar * _dst_data, size_t 
_dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _uv) + : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), muv(_uv), stride(_stride) {} + + void operator()(const Range& range) const CV_OVERRIDE + { + const int rangeBegin = range.start * 2; + const int rangeEnd = range.end * 2; + + const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2; + + for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride) + { + uchar* row1 = dst_data + dst_step * j; + uchar* row2 = dst_data + dst_step * (j + 1); + const uchar* y2 = y1 + stride; + + int i = 0; +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 a = vx_setall_u8(uchar(0xff)); + for( ; i <= width - 2*vsize; + i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) + { + v_uint8 u, v; + v_load_deinterleave(uv + i, u, v); + + if(uIdx) + { + swap(u, v); + } + + v_uint8 vy[4]; + v_load_deinterleave(y1 + i, vy[0], vy[1]); + v_load_deinterleave(y2 + i, vy[2], vy[3]); + + v_int32 ruv[4], guv[4], buv[4]; + uvToRGBuv(u, v, ruv, guv, buv); + + v_uint8 r[4], g[4], b[4]; + + for(int k = 0; k < 4; k++) + { + yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); + } + + if(bIdx) + { + for(int k = 0; k < 4; k++) + swap(r[k], b[k]); + } + + // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] 
+ v_uint8 r0_0, r0_1, r1_0, r1_1; + v_zip(r[0], r[1], r0_0, r0_1); + v_zip(r[2], r[3], r1_0, r1_1); + v_uint8 g0_0, g0_1, g1_0, g1_1; + v_zip(g[0], g[1], g0_0, g0_1); + v_zip(g[2], g[3], g1_0, g1_1); + v_uint8 b0_0, b0_1, b1_0, b1_1; + v_zip(b[0], b[1], b0_0, b0_1); + v_zip(b[2], b[3], b1_0, b1_1); + + if(dcn == 4) + { + v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a); + v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a); + + v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a); + v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a); + } + else //dcn == 3 + { + v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0); + v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1); + + v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0); + v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1); + } + } + vx_cleanup(); +#endif + for ( ; i < width; i += 2, row1 += dcn*2, row2 += dcn*2) + { + uchar u = uv[i + 0 + uIdx]; + uchar v = uv[i + 1 - uIdx]; + + uchar vy01 = y1[i]; + uchar vy11 = y1[i + 1]; + uchar vy02 = y2[i]; + uchar vy12 = y2[i + 1]; + + cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); + } + } + } +}; + +template +struct YUV420p2RGB8Invoker : ParallelLoopBody +{ + uchar * dst_data; + size_t dst_step; + int width; + const uchar* my1, *mu, *mv; + size_t stride; + int ustepIdx, vstepIdx; + + YUV420p2RGB8Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx) + : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), mu(_u), mv(_v), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {} + + void operator()(const Range& range) const CV_OVERRIDE + { + const int rangeBegin = range.start * 2; + const int rangeEnd = range.end * 2; + + int uvsteps[2] = {width/2, static_cast(stride) - width/2}; + int usIdx = ustepIdx, vsIdx = vstepIdx; + + const uchar* y1 = my1 + rangeBegin * stride; + const uchar* u1 = mu + (range.start / 2) * 
stride; + const uchar* v1 = mv + (range.start / 2) * stride; + + if(range.start % 2 == 1) + { + u1 += uvsteps[(usIdx++) & 1]; + v1 += uvsteps[(vsIdx++) & 1]; + } + + for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1]) + { + uchar* row1 = dst_data + dst_step * j; + uchar* row2 = dst_data + dst_step * (j + 1); + const uchar* y2 = y1 + stride; + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 a = vx_setall_u8(uchar(0xff)); + for( ; i <= width/2 - vsize; + i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) + { + v_uint8 u, v; + u = vx_load(u1 + i); + v = vx_load(v1 + i); + + v_uint8 vy[4]; + v_load_deinterleave(y1 + 2*i, vy[0], vy[1]); + v_load_deinterleave(y2 + 2*i, vy[2], vy[3]); + + v_int32 ruv[4], guv[4], buv[4]; + uvToRGBuv(u, v, ruv, guv, buv); + + v_uint8 r[4], g[4], b[4]; + + for(int k = 0; k < 4; k++) + { + yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); + } + + if(bIdx) + { + for(int k = 0; k < 4; k++) + swap(r[k], b[k]); + } + + // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] 
+ v_uint8 r0_0, r0_1, r1_0, r1_1; + v_zip(r[0], r[1], r0_0, r0_1); + v_zip(r[2], r[3], r1_0, r1_1); + v_uint8 g0_0, g0_1, g1_0, g1_1; + v_zip(g[0], g[1], g0_0, g0_1); + v_zip(g[2], g[3], g1_0, g1_1); + v_uint8 b0_0, b0_1, b1_0, b1_1; + v_zip(b[0], b[1], b0_0, b0_1); + v_zip(b[2], b[3], b1_0, b1_1); + + if(dcn == 4) + { + v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a); + v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a); + + v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a); + v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a); + } + else //dcn == 3 + { + v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0); + v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1); + + v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0); + v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1); + } + } + vx_cleanup(); +#endif + for (; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2) + { + uchar u = u1[i]; + uchar v = v1[i]; + + uchar vy01 = y1[2 * i]; + uchar vy11 = y1[2 * i + 1]; + uchar vy02 = y2[2 * i]; + uchar vy12 = y2[2 * i + 1]; + + cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); + } + } + } +}; + + +#define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240) + +template +inline void cvtYUV420sp2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _uv) +{ + YUV420sp2RGB8Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _uv); + if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) + parallel_for_(Range(0, dst_height/2), converter); + else + converter(Range(0, dst_height/2)); +} + +template +inline void cvtYUV420p2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx) +{ + YUV420p2RGB8Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _u, _v, ustepIdx, vstepIdx); + if (dst_width * dst_height >= 
MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) + parallel_for_(Range(0, dst_height/2), converter); + else + converter(Range(0, dst_height/2)); +} + +///////////////////////////////////// RGB -> YUV420p ///////////////////////////////////// + +static inline uchar rgbToY42x(uchar r, uchar g, uchar b) +{ + const int shifted16 = (16 << ITUR_BT_601_SHIFT); + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16; + + return saturate_cast(yy >> ITUR_BT_601_SHIFT); +} + +static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b) +{ + const int shifted16 = (16 << ITUR_BT_601_SHIFT); + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + v_uint16 r0, r1, g0, g1, b0, b1; + v_expand(r, r0, r1); + v_expand(g, g0, g1); + v_expand(b, b0, b1); + + v_uint32 rq[4], gq[4], bq[4]; + v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]); + v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]); + v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]); + + v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY); + v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16); + + v_uint32 y[4]; + for(int k = 0; k < 4; k++) + { + y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT; + } + + v_uint16 y0, y1; + y0 = v_pack(y[0], y[1]); + y1 = v_pack(y[2], y[3]); + + return v_pack(y0, y1); +} + +static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v) +{ + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + const int shifted128 = (128 << ITUR_BT_601_SHIFT); + int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128; + int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128; + + u = saturate_cast(uu >> ITUR_BT_601_SHIFT); + v = saturate_cast(vv >> ITUR_BT_601_SHIFT); +} + +static inline void 
rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1, + const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v) +{ + // [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..] + v_int16 vlowByte = vx_setall_s16(0x00ff); + v_int16 rd0, rd1, gd0, gd1, bd0, bd1; + rd0 = v_reinterpret_as_s16(r0) & vlowByte; + rd1 = v_reinterpret_as_s16(r1) & vlowByte; + gd0 = v_reinterpret_as_s16(g0) & vlowByte; + gd1 = v_reinterpret_as_s16(g1) & vlowByte; + bd0 = v_reinterpret_as_s16(b0) & vlowByte; + bd1 = v_reinterpret_as_s16(b1) & vlowByte; + + v_int32 rq[4], gq[4], bq[4]; + v_expand(rd0, rq[0], rq[1]); + v_expand(rd1, rq[2], rq[3]); + v_expand(gd0, gq[0], gq[1]); + v_expand(gd1, gq[2], gq[3]); + v_expand(bd0, bq[0], bq[1]); + v_expand(bd1, bq[2], bq[3]); + + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + const int shifted128 = (128 << ITUR_BT_601_SHIFT); + v_int32 shift = vx_setall_s32(halfShift + shifted128); + v_int32 ru, gu, bu, gv, bv; + ru = vx_setall_s32(ITUR_BT_601_CRU); + gu = vx_setall_s32(ITUR_BT_601_CGU); + gv = vx_setall_s32(ITUR_BT_601_CGV); + bu = vx_setall_s32(ITUR_BT_601_CBU); + bv = vx_setall_s32(ITUR_BT_601_CBV); + + v_int32 uq[4], vq[4]; + for(int k = 0; k < 4; k++) + { + uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT; + vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT; + } + + v_int16 u0, u1, v0, v1; + u0 = v_pack(uq[0], uq[1]); + u1 = v_pack(uq[2], uq[3]); + v0 = v_pack(vq[0], vq[1]); + v1 = v_pack(vq[2], vq[3]); + + u = v_pack_u(u0, u1); + v = v_pack_u(v0, v1); +} + + +struct RGB8toYUV420pInvoker: public ParallelLoopBody +{ + RGB8toYUV420pInvoker(const uchar * _srcData, size_t _srcStep, + uchar * _yData, uchar * _uvData, size_t _dstStep, + int _srcWidth, int _srcHeight, int _scn, bool _swapBlue, bool _swapUV, bool _interleave) + : srcData(_srcData), srcStep(_srcStep), + yData(_yData), uvData(_uvData), dstStep(_dstStep), + srcWidth(_srcWidth), srcHeight(_srcHeight), + srcCn(_scn), 
swapBlue(_swapBlue), swapUV(_swapUV), interleave(_interleave) { } + + void operator()(const Range& rowRange) const CV_OVERRIDE + { + const int w = srcWidth; + const int h = srcHeight; + const int scn = srcCn; + const uchar* srcRow = (uchar*)0; + uchar* yRow = (uchar*)0, *uRow = (uchar*)0, *vRow = (uchar*)0, *uvRow = (uchar*)0; + for( int sRow = rowRange.start*2; sRow < rowRange.end*2; sRow++) + { + srcRow = srcData + srcStep*sRow; + yRow = yData + dstStep * sRow; + bool evenRow = (sRow % 2) == 0; + if(evenRow) + { + if (interleave) + { + uvRow = uvData + dstStep*(sRow/2); + } + else + { + uRow = uvData + dstStep * (sRow/4) + ((sRow/2) % 2) * (w/2); + vRow = uvData + dstStep * ((sRow + h)/4) + (((sRow + h)/2) % 2) * (w/2); + } + } + int i = 0; +#if CV_SIMD + const int vsize = v_uint8::nlanes; + + for( ; i <= w/2 - vsize; + i += vsize) + { + // processing (2*vsize) pixels at once + v_uint8 b0, b1, g0, g1, r0, r1, a0, a1; + if(scn == 4) + { + v_load_deinterleave(srcRow + 2*4*i + 0*vsize, b0, g0, r0, a0); + v_load_deinterleave(srcRow + 2*4*i + 4*vsize, b1, g1, r1, a1); + } + else // scn == 3 + { + v_load_deinterleave(srcRow + 2*3*i + 0*vsize, b0, g0, r0); + v_load_deinterleave(srcRow + 2*3*i + 3*vsize, b1, g1, r1); + } + + if(swapBlue) + { + swap(b0, r0); swap(b1, r1); + } + + v_uint8 y0, y1; + + y0 = rgbToY42x(r0, g0, b0); + y1 = rgbToY42x(r1, g1, b1); + + v_store(yRow + 2*i + 0*vsize, y0); + v_store(yRow + 2*i + 1*vsize, y1); + + if(evenRow) + { + v_uint8 u, v; + rgbToUV42x(r0, r1, g0, g1, b0, b1, u, v); + + if(swapUV) + { + swap(u, v); + } + + if(interleave) + { + v_store_interleave(uvRow + 2*i, u, v); + } + else + { + v_store(uRow + i, u); + v_store(vRow + i, v); + } + } + } + vx_cleanup(); +#endif + // processing two pixels at once + for( ; i < w/2; i++) + { + uchar b0, g0, r0; + uchar b1, g1, r1; + b0 = srcRow[(2*i+0)*scn + 0]; + g0 = srcRow[(2*i+0)*scn + 1]; + r0 = srcRow[(2*i+0)*scn + 2]; + b1 = srcRow[(2*i+1)*scn + 0]; + g1 = srcRow[(2*i+1)*scn + 1]; + r1 = 
srcRow[(2*i+1)*scn + 2]; + + if(swapBlue) + { + swap(b0, r0); swap(b1, r1); + } + + uchar y0 = rgbToY42x(r0, g0, b0); + uchar y1 = rgbToY42x(r1, g1, b1); + + yRow[2*i+0] = y0; + yRow[2*i+1] = y1; + + if(evenRow) + { + uchar uu, vv; + rgbToUV42x(r0, g0, b0, uu, vv); + if(swapUV) + { + swap(uu, vv); + } + + if(interleave) + { + uvRow[2*i+0] = uu; + uvRow[2*i+1] = vv; + } + else + { + uRow[i] = uu; + vRow[i] = vv; + } + } + } + } + } + + const uchar * srcData; + size_t srcStep; + uchar *yData, *uvData; + size_t dstStep; + int srcWidth; + int srcHeight; + const int srcCn; + bool swapBlue; + bool swapUV; + bool interleave; +}; + + +///////////////////////////////////// YUV422 -> RGB ///////////////////////////////////// + +// bIdx is 0 or 2; [uIdx, yIdx] is [0, 0], [0, 1], [1, 0]; dcn is 3 or 4 +template +struct YUV422toRGB8Invoker : ParallelLoopBody +{ + uchar * dst_data; + size_t dst_step; + const uchar * src_data; + size_t src_step; + int width; + + YUV422toRGB8Invoker(uchar * _dst_data, size_t _dst_step, + const uchar * _src_data, size_t _src_step, + int _width) + : dst_data(_dst_data), dst_step(_dst_step), src_data(_src_data), src_step(_src_step), width(_width) {} + + void operator()(const Range& range) const CV_OVERRIDE + { + int rangeBegin = range.start; + int rangeEnd = range.end; + + // [yIdx, uIdx] | [uidx, vidx]: + // 0, 0 | 1, 3 + // 0, 1 | 3, 1 + // 1, 0 | 0, 2 + const int uidx = 1 - yIdx + uIdx * 2; + const int vidx = (2 + uidx) % 4; + const uchar* yuv_src = src_data + rangeBegin * src_step; + + for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += src_step) + { + uchar* row = dst_data + dst_step * j; + int i = 0; +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 a = vx_setall_u8(uchar(0xff)); + for(; i <= 2*width - 4*vsize; + i += 4*vsize, row += vsize*dcn*2) + { + v_uint8 u, v, vy[2]; + if(yIdx == 1) // UYVY + { + v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]); + } + else // YUYV or YVYU + { + v_load_deinterleave(yuv_src + i, vy[0], u, 
vy[1], v); + if(uIdx == 1) // YVYU + { + swap(u, v); + } + } + + v_int32 ruv[4], guv[4], buv[4]; + uvToRGBuv(u, v, ruv, guv, buv); + + v_uint8 r[2], g[2], b[2]; + + yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]); + yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]); + + if(bIdx) + { + swap(r[0], b[0]); + swap(r[1], b[1]); + } + + // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] + v_uint8 r0_0, r0_1; + v_zip(r[0], r[1], r0_0, r0_1); + v_uint8 g0_0, g0_1; + v_zip(g[0], g[1], g0_0, g0_1); + v_uint8 b0_0, b0_1; + v_zip(b[0], b[1], b0_0, b0_1); + + if(dcn == 4) + { + v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0, a); + v_store_interleave(row + 4*vsize, b0_1, g0_1, r0_1, a); + } + else //dcn == 3 + { + v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0); + v_store_interleave(row + 3*vsize, b0_1, g0_1, r0_1); + } + } + vx_cleanup(); +#endif + for (; i < 2 * width; i += 4, row += dcn*2) + { + uchar u = yuv_src[i + uidx]; + uchar v = yuv_src[i + vidx]; + + uchar vy0 = yuv_src[i + yIdx]; + uchar vy1 = yuv_src[i + yIdx + 2]; + + cvtYuv42xxp2RGB8(u, v, vy0, vy1, 0, 0, row, (uchar*)(0)); + } + } + } +}; + +#define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240) + +template +inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_data, size_t src_step, + int width, int height) +{ + YUV422toRGB8Invoker converter(dst_data, dst_step, src_data, src_step, width); + if (width * height >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION) + parallel_for_(Range(0, height), converter); + else + converter(Range(0, height)); +} + +// +// HAL functions +// + +namespace hal +{ + +// 8u, 16u, 32f +void cvtBGRtoYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue, bool isCbCr) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtBGRtoYUV, cv_hal_cvtBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr); + +#if defined(HAVE_IPP) 
+#if !IPP_DISABLE_RGB_YUV + CV_IPP_CHECK() + { + if (scn == 3 && depth == CV_8U && swapBlue && !isCbCr) + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R))) + return; + } + else if (scn == 3 && depth == CV_8U && !swapBlue && !isCbCr) + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], + (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth))) + return; + } + else if (scn == 4 && depth == CV_8U && swapBlue && !isCbCr) + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth))) + return; + } + else if (scn == 4 && depth == CV_8U && !swapBlue && !isCbCr) + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth))) + return; + } + } +#endif +#endif + + int blueIdx = swapBlue ? 
2 : 0; + if( depth == CV_8U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i(scn, blueIdx, isCbCr)); + else if( depth == CV_16U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i(scn, blueIdx, isCbCr)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_f(scn, blueIdx, isCbCr)); +} + +void cvtYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isCbCr) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtYUVtoBGR, cv_hal_cvtYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr); + + +#if defined(HAVE_IPP) +#if !IPP_DISABLE_YUV_RGB + CV_IPP_CHECK() + { + if (dcn == 3 && depth == CV_8U && swapBlue && !isCbCr) + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R))) + return; + } + else if (dcn == 3 && depth == CV_8U && !swapBlue && !isCbCr) + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, + ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth))) + return; + } + else if (dcn == 4 && depth == CV_8U && swapBlue && !isCbCr) + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, + ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth))) + return; + } + else if (dcn == 4 && depth == CV_8U && !swapBlue && !isCbCr) + { + if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, + IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, + ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth))) + return; + } + } +#endif +#endif + + int blueIdx = swapBlue ? 
2 : 0; + if( depth == CV_8U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i(dcn, blueIdx, isCbCr)); + else if( depth == CV_16U ) + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i(dcn, blueIdx, isCbCr)); + else + CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_f(dcn, blueIdx, isCbCr)); +} + +void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); + const uchar* uv = src_data + src_step * static_cast(dst_height); + cvtTwoPlaneYUVtoBGR(src_data, uv, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); +} + +typedef void (*cvt_2plane_yuv_ptr_t)(uchar * /* dst_data*/, + size_t /* dst_step */, + int /* dst_width */, + int /* dst_height */, + size_t /* _stride */, + const uchar* /* _y1 */, + const uchar* /* _uv */); + +void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx) +{ + CV_INSTRUMENT_REGION(); + + // TODO: add hal replacement method + + int blueIdx = swapBlue ? 
2 : 0; + + cvt_2plane_yuv_ptr_t cvtPtr; + switch(dcn*100 + blueIdx * 10 + uIdx) + { + case 300: cvtPtr = cvtYUV420sp2RGB<0, 0, 3>; break; + case 301: cvtPtr = cvtYUV420sp2RGB<0, 1, 3>; break; + case 320: cvtPtr = cvtYUV420sp2RGB<2, 0, 3>; break; + case 321: cvtPtr = cvtYUV420sp2RGB<2, 1, 3>; break; + case 400: cvtPtr = cvtYUV420sp2RGB<0, 0, 4>; break; + case 401: cvtPtr = cvtYUV420sp2RGB<0, 1, 4>; break; + case 420: cvtPtr = cvtYUV420sp2RGB<2, 0, 4>; break; + case 421: cvtPtr = cvtYUV420sp2RGB<2, 1, 4>; break; + default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; + }; + + cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); +} + +typedef void (*cvt_3plane_yuv_ptr_t)(uchar * /* dst_data */, + size_t /* dst_step */, + int /* dst_width */, + int /* dst_height */, + size_t /* _stride */, + const uchar* /* _y1 */, + const uchar* /* _u */, + const uchar* /* _v */, + int /* ustepIdx */, + int /* vstepIdx */); + +void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); + const uchar* u = src_data + src_step * static_cast(dst_height); + const uchar* v = src_data + src_step * static_cast(dst_height + dst_height/4) + (dst_width/2) * ((dst_height % 4)/2); + + int ustepIdx = 0; + int vstepIdx = dst_height % 4 == 2 ? 1 : 0; + + if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); } + int blueIdx = swapBlue ? 
2 : 0; + + cvt_3plane_yuv_ptr_t cvtPtr; + switch(dcn*10 + blueIdx) + { + case 30: cvtPtr = cvtYUV420p2RGB<0, 3>; break; + case 32: cvtPtr = cvtYUV420p2RGB<2, 3>; break; + case 40: cvtPtr = cvtYUV420p2RGB<0, 4>; break; + case 42: cvtPtr = cvtYUV420p2RGB<2, 4>; break; + default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; + }; + + cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx); +} + +void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx); + uchar * uv_data = dst_data + dst_step * height; + + RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height, + scn, swapBlue, uIdx == 2, false); + + if( width * height >= 320*240 ) + parallel_for_(Range(0, height/2), cvt); + else + cvt(Range(0, height/2)); +} + +void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, + uchar * y_data, uchar * uv_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx) +{ + CV_INSTRUMENT_REGION(); + + // TODO: add hal replacement method + + RGB8toYUV420pInvoker cvt(src_data, src_step, y_data, uv_data, dst_step, width, height, + scn, swapBlue, uIdx == 2, true); + + if( width * height >= 320*240 ) + parallel_for_(Range(0, height/2), cvt); + else + cvt(Range(0, height/2)); +} + +typedef void (*cvt_1plane_yuv_ptr_t)(uchar * /* dst_data */, + size_t /* dst_step */, + const uchar * /* src_data */, + size_t /* src_step */, + int /* width */, + int /* height */); + +void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int dcn, bool swapBlue, int uIdx, int ycn) +{ + CV_INSTRUMENT_REGION(); + + 
CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn); + + cvt_1plane_yuv_ptr_t cvtPtr; + int blueIdx = swapBlue ? 2 : 0; + switch(dcn*1000 + blueIdx*100 + uIdx*10 + ycn) + { + case 3000: cvtPtr = cvtYUV422toRGB<0,0,0,3>; break; + case 3001: cvtPtr = cvtYUV422toRGB<0,0,1,3>; break; + case 3010: cvtPtr = cvtYUV422toRGB<0,1,0,3>; break; + case 3200: cvtPtr = cvtYUV422toRGB<2,0,0,3>; break; + case 3201: cvtPtr = cvtYUV422toRGB<2,0,1,3>; break; + case 3210: cvtPtr = cvtYUV422toRGB<2,1,0,3>; break; + case 4000: cvtPtr = cvtYUV422toRGB<0,0,0,4>; break; + case 4001: cvtPtr = cvtYUV422toRGB<0,0,1,4>; break; + case 4010: cvtPtr = cvtYUV422toRGB<0,1,0,4>; break; + case 4200: cvtPtr = cvtYUV422toRGB<2,0,0,4>; break; + case 4201: cvtPtr = cvtYUV422toRGB<2,0,1,4>; break; + case 4210: cvtPtr = cvtYUV422toRGB<2,1,0,4>; break; + default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; + }; + + cvtPtr(dst_data, dst_step, src_data, src_step, width, height); +} + +} // namespace hal + +// +// OCL calls +// + +#ifdef HAVE_OPENCL + +bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ) +{ + OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); + + if(!h.createKernel("YUV2RGB", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=%d -D bidx=%d", dcn, bidx))) + { + return false; + } + + return h.run(); +} + +bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx ) +{ + OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); + + if(!h.createKernel("RGB2YUV", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=3 -D bidx=%d", bidx))) + { + return false; + } + + return h.run(); +} + +bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx) +{ + OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); + + if(!h.createKernel("YCrCb2RGB", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=%d -D bidx=%d", dcn, bidx))) + { + 
return false; + } + + return h.run(); +} + +bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx) +{ + OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); + + if(!h.createKernel("RGB2YCrCb", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=3 -D bidx=%d", bidx))) + { + return false; + } + + return h.run(); +} + +bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ) +{ + OclHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); + + bool optimized = _src.offset() % 4 == 0 && _src.step() % 4 == 0; + if(!h.createKernel("YUV2RGB_422", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx, + optimized ? " -D USE_OPTIMIZED_LOAD" : ""))) + { + return false; + } + + return h.run(); +} + +bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ) +{ + OclHelper< Set<1>, Set<1>, Set, FROM_YUV> h(_src, _dst, 1); + + h.src.rowRange(0, _dst.rows()).copyTo(_dst); + return true; +} + +bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ) +{ + OclHelper< Set<1>, Set<3, 4>, Set, FROM_YUV > h(_src, _dst, dcn); + + if(!h.createKernel("YUV2RGB_NVx", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx))) + { + return false; + } + + return h.run(); +} + +bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ) +{ + OclHelper< Set<1>, Set<3, 4>, Set, FROM_YUV > h(_src, _dst, dcn); + + if(!h.createKernel("YUV2RGB_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx, + _src.isContinuous() ? 
" -D SRC_CONT" : ""))) + { + return false; + } + + return h.run(); +} + +bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx ) +{ + OclHelper< Set<3, 4>, Set<1>, Set, TO_YUV > h(_src, _dst, 1); + + if(!h.createKernel("RGB2YUV_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=1 -D bidx=%d -D uidx=%d", bidx, uidx))) + { + return false; + } + + return h.run(); +} + +#endif + +// +// HAL calls +// + +void cvtColorBGR2YUV(InputArray _src, OutputArray _dst, bool swapb, bool crcb) +{ + CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); + + hal::cvtBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.depth, h.scn, swapb, crcb); +} + +void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb) +{ + if(dcn <= 0) dcn = 3; + CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); + + hal::cvtYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.depth, dcn, swapb, crcb); +} + +void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn) +{ + CvtHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); + + hal::cvtOnePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + dcn, swapb, uidx, ycn); +} + +void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi ) +{ + CV_Assert( _src.channels() == 2 && _src.depth() == CV_8U ); + + extractChannel(_src, _dst, coi); +} + +void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx) +{ + CvtHelper< Set<3, 4>, Set<1>, Set, TO_YUV > h(_src, _dst, 1); + + hal::cvtBGRtoThreePlaneYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, + h.scn, swapb, uidx); +} + +void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ) +{ + CvtHelper< Set<1>, Set<1>, Set, FROM_YUV > h(_src, _dst, 1); + +#ifdef HAVE_IPP +#if IPP_VERSION_X100 >= 201700 + if 
(CV_INSTRUMENT_FUN_IPP(ippiCopy_8u_C1R_L, h.src.data, (IppSizeL)h.src.step, h.dst.data, (IppSizeL)h.dst.step, + ippiSizeL(h.dstSz.width, h.dstSz.height)) >= 0) + return; +#endif +#endif + h.src(Range(0, h.dstSz.height), Range::all()).copyTo(h.dst); +} + +void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx) +{ + if(dcn <= 0) dcn = 3; + CvtHelper< Set<1>, Set<3, 4>, Set, FROM_YUV> h(_src, _dst, dcn); + + hal::cvtThreePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows, + dcn, swapb, uidx); +} + +// http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples +// http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples + +void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ) +{ + if(dcn <= 0) dcn = 3; + CvtHelper< Set<1>, Set<3, 4>, Set, FROM_YUV> h(_src, _dst, dcn); + + hal::cvtTwoPlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows, + dcn, swapb, uidx); +} + +void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx ) +{ + int stype = _ysrc.type(); + int depth = CV_MAT_DEPTH(stype); + Size ysz = _ysrc.size(), uvs = _uvsrc.size(); + CV_Assert( dcn == 3 || dcn == 4 ); + CV_Assert( depth == CV_8U ); + CV_Assert( ysz.width == uvs.width * 2 && ysz.height == uvs.height * 2 ); + + Mat ysrc = _ysrc.getMat(), uvsrc = _uvsrc.getMat(); + + _dst.create( ysz, CV_MAKETYPE(depth, dcn)); + Mat dst = _dst.getMat(); + + hal::cvtTwoPlaneYUVtoBGR(ysrc.data, uvsrc.data, ysrc.step, + dst.data, dst.step, dst.cols, dst.rows, + dcn, swapb, uidx); +} + +} // namespace cv