Merge pull request #10869 from savuor:color_cpp_split

color.cpp split (#10869) * initial split is done * files renamed (these names are excluded during compilation) * IPP code moved to corresponding files * splineBuild, splineInterpolate -> color_lab.cpp * Lab, Luv: little refactored * it compiles (didn't check work); Lab OCL code moved to color_lab.cpp * cvtcolor.cl: Lab/Luv part moved to color_lab.cl * cvtcolor.cl: color_rgb.cl extracted * cvtcolor.cl: color_yuv.cl separated * cvtcolor.cl: color_hsv.cl extracted * cvtcolor.cl: extracted to color_lab.cl and color_rgb.cl * helper functions moved to hpp file * Lab, Luv: moved to color_lab.cpp * CPU XYZ: to color_lab.cpp * OCL XYZ: to color_lab.cpp * warning fixed * CvtHelper added * CPU YUV: to color_yuv.cpp, helpers to color.hpp * CPU HLS/HSV: to color_hsv.cpp * CPU BGR2BGR: to color_rgb.cpp * CPU RGB: to color_rgb.cpp * extra arg removed * CPU YUV: to color_yuv.cpp * color code decoded * OclHelper added, some funcs rewritten * color_lab.cpp: refactored to use OclHelper * OCL RGB: to color_rgb.cpp * OCL HLS/HSV: to color_hsv.cpp * OCL YUV: to color_yuv.cpp * OCL YUV planes: to color_yuv.cpp * OCL: color code reduced * licence to demosaicing.cpp * IPP func tables to color_rgb.cpp * code cleanup * HAVE_OPENCL ifdefs added * helpers made more common * fixed two plane YUV with separate mats * fixed warning in gcc7.2.0 * precomp header fixed * color space classification functions fixed * helpers fixed * rename: isSRGB -> is_sRGB
2025-06-06 00:43:52 +08:00 · 2018-03-15 14:10:40 +03:00 · 2018-03-15 14:10:40 +03:00 · 64916d3d83
commit 64916d3d83
parent c727e8a4d0
12 changed files with 14421 additions and 13392 deletions
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
--- a/modules/imgproc/src/color.hpp
+++ b/modules/imgproc/src/color.hpp
@ -0,0 +1,668 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core/utility.hpp"
+#include <limits>
+#include "opencl_kernels_imgproc.hpp"
+#include "hal_replacement.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/core/softfloat.hpp"
+
+#define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n)) // right-shifts x by n bits with rounding: half of 2^n is added first; n must be >= 1
+
+namespace cv
+{
+
+//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
+const float B2YF = 0.114f; // BT.601 weight of the blue channel in Y
+const float G2YF = 0.587f; // BT.601 weight of the green channel in Y
+const float R2YF = 0.299f; // BT.601 weight of the red channel in Y
+
+enum
+{
+    yuv_shift = 14, // 1 << 14 == 16384, the scale of the R2Y/G2Y/B2Y integers below
+    xyz_shift = 12, // NOTE(review): presumably the fixed-point shift of the XYZ code; not used in this header
+    R2Y = 4899, // == R2YF*16384 rounded to nearest (0.299*16384 = 4898.8)
+    G2Y = 9617, // == G2YF*16384 rounded to nearest (0.587*16384 = 9617.4)
+    B2Y = 1868, // == B2YF*16384 rounded to nearest (0.114*16384 = 1867.8); R2Y+G2Y+B2Y == 16384
+    BLOCK_SIZE = 256 // NOTE(review): not used in this header; meaning to be confirmed at the use sites
+};
+
+// Channel traits for element types handled through std::numeric_limits:
+// max() is the largest representable value, half() is max()/2 + 1.
+template<typename _Tp> struct ColorChannel
+{
+    typedef float worktype_f;
+
+    static _Tp max()
+    {
+        return std::numeric_limits<_Tp>::max();
+    }
+
+    static _Tp half()
+    {
+        return static_cast<_Tp>(max()/2 + 1);
+    }
+};
+
+// Channel traits for float data: values span [0, 1], so max() is 1 and half() is 0.5.
+template<> struct ColorChannel<float>
+{
+    typedef float worktype_f;
+
+    static float max()
+    {
+        return 1.f;
+    }
+
+    static float half()
+    {
+        return 0.5f;
+    }
+};
+
+/*template<> struct ColorChannel<double>
+{
+    typedef double worktype_f;
+    static double max() { return 1.; }
+    static double half() { return 0.5; }
+};*/
+
+//
+// Helper functions
+//
+
+namespace {
+
+// Returns true when `code` converts to or from HSV, in BGR or RGB channel
+// order, with or without the _FULL suffix; false for any other code.
+inline bool isHSV(int code)
+{
+    const bool fromHSV = code == COLOR_HSV2BGR      || code == COLOR_HSV2RGB ||
+                         code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL;
+    const bool toHSV   = code == COLOR_BGR2HSV      || code == COLOR_RGB2HSV ||
+                         code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
+    return fromHSV || toHSV;
+}
+
+// Returns true when `code` converts to or from Lab (BGR, RGB, LBGR or LRGB
+// on the other side); false for any other code.
+inline bool isLab(int code)
+{
+    const bool fromLab = code == COLOR_Lab2BGR  || code == COLOR_Lab2RGB ||
+                         code == COLOR_Lab2LBGR || code == COLOR_Lab2LRGB;
+    const bool toLab   = code == COLOR_BGR2Lab  || code == COLOR_RGB2Lab ||
+                         code == COLOR_LBGR2Lab || code == COLOR_LRGB2Lab;
+    return fromLab || toLab;
+}
+
+// Returns true for the Lab/Luv conversion codes whose RGB side is plain
+// BGR/RGB (i.e. not the LBGR/LRGB variants); false for any other code.
+inline bool is_sRGB(int code)
+{
+    const bool toLabLuv   = code == COLOR_BGR2Lab || code == COLOR_RGB2Lab ||
+                            code == COLOR_BGR2Luv || code == COLOR_RGB2Luv;
+    const bool fromLabLuv = code == COLOR_Lab2BGR || code == COLOR_Lab2RGB ||
+                            code == COLOR_Luv2BGR || code == COLOR_Luv2RGB;
+    return toLabLuv || fromLabLuv;
+}
+
+// Returns false for every conversion code listed below (each of them names
+// the BGR/BGRA channel order) and true for any code that is not listed.
+inline bool swapBlue(int code)
+{
+    bool swap = true;
+    switch (code)
+    {
+    // BGR <-> BGRA and BGR -> gray
+    case COLOR_BGR2BGRA: case COLOR_BGRA2BGR: case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
+    // packed 16-bit formats
+    case COLOR_BGR2BGR565: case COLOR_BGRA2BGR565: case COLOR_BGR5652BGR: case COLOR_BGR5652BGRA:
+    case COLOR_BGR2BGR555: case COLOR_BGRA2BGR555: case COLOR_BGR5552BGR: case COLOR_BGR5552BGRA:
+    // YCrCb, YUV, XYZ
+    case COLOR_BGR2YCrCb: case COLOR_YCrCb2BGR:
+    case COLOR_BGR2YUV: case COLOR_YUV2BGR:
+    case COLOR_BGR2XYZ: case COLOR_XYZ2BGR:
+    // HSV, HLS
+    case COLOR_BGR2HSV: case COLOR_BGR2HSV_FULL: case COLOR_HSV2BGR: case COLOR_HSV2BGR_FULL:
+    case COLOR_BGR2HLS: case COLOR_BGR2HLS_FULL: case COLOR_HLS2BGR: case COLOR_HLS2BGR_FULL:
+    // Lab, Luv
+    case COLOR_BGR2Lab: case COLOR_LBGR2Lab: case COLOR_Lab2BGR: case COLOR_Lab2LBGR:
+    case COLOR_BGR2Luv: case COLOR_LBGR2Luv: case COLOR_Luv2BGR: case COLOR_Luv2LBGR:
+    // YUV2BGR[A]_{YV12, IYUV, NV21, NV12}
+    case COLOR_YUV2BGR_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2BGRA_IYUV:
+    case COLOR_YUV2BGR_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2BGRA_NV12:
+    // YUV2BGR[A]_{UYVY, YUY2, YVYU}
+    case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY:
+    case COLOR_YUV2BGR_YUY2: case COLOR_YUV2BGRA_YUY2:
+    case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU:
+    // BGR[A]2YUV_{IYUV, YV12}
+    case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12:
+        swap = false;
+        break;
+    default:
+        break;
+    }
+    return swap;
+}
+
+// Returns true when `code` is one of the HSV or HLS conversions carrying the
+// _FULL suffix (in either direction and channel order); false otherwise.
+inline bool isFullRangeHSV(int code)
+{
+    const bool toFull   = code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL ||
+                          code == COLOR_BGR2HLS_FULL || code == COLOR_RGB2HLS_FULL;
+    const bool fromFull = code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL ||
+                          code == COLOR_HLS2BGR_FULL || code == COLOR_HLS2RGB_FULL;
+    return toFull || fromFull;
+}
+
+// Maps a conversion code to the number of destination channels it implies:
+// 4 for the listed *BGRA/*RGBA targets, 3 for the listed *BGR/*RGB targets,
+// and 0 for every code that is not listed.
+inline int dstChannels(int code)
+{
+    int dcn = 0;
+    switch( code )
+    {
+    case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2RGBA: case COLOR_GRAY2BGRA:
+    case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA:
+    case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21:
+    case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
+    case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGBA_UYVY:
+    case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YUY2:
+    case COLOR_YUV2BGRA_YVYU: case COLOR_YUV2RGBA_YVYU:
+        dcn = 4;
+        break;
+
+    case COLOR_BGRA2BGR: case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_GRAY2BGR:
+    case COLOR_BGR5652BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552BGR: case COLOR_BGR5552RGB:
+    case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21:
+    case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV:
+    case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGB_UYVY:
+    case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YUY2:
+    case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGB_YVYU:
+        dcn = 3;
+        break;
+
+    default:
+        break;
+    }
+    return dcn;
+}
+
+// Returns the number of green bits of the packed 16-bit format named by
+// `code`: 6 for the BGR565 codes, 5 for the BGR555 codes, 0 for other codes.
+inline int greenBits(int code)
+{
+    int gbits = 0;
+    switch( code )
+    {
+    case COLOR_BGR2BGR565: case COLOR_RGB2BGR565: case COLOR_BGRA2BGR565: case COLOR_RGBA2BGR565:
+    case COLOR_GRAY2BGR565: case COLOR_BGR5652GRAY:
+    case COLOR_BGR5652BGR: case COLOR_BGR5652RGB: case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA:
+        gbits = 6;
+        break;
+
+    case COLOR_BGR2BGR555: case COLOR_RGB2BGR555: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR555:
+    case COLOR_GRAY2BGR555: case COLOR_BGR5552GRAY:
+    case COLOR_BGR5552BGR: case COLOR_BGR5552RGB: case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA:
+        gbits = 5;
+        break;
+
+    default:
+        break;
+    }
+    return gbits;
+}
+
+// Maps a YUV conversion code to the U index used by the YUV converters:
+// 2 for *2YUV_YV12, 1 for YVYU / *2YUV_IYUV / NV21 / YUV2*_YV12,
+// 0 for NV12 / YUV2*_IYUV / UYVY / YUY2, and -1 for any code not listed.
+inline int uIndex(int code)
+{
+    int uidx = -1;
+    switch( code )
+    {
+    case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12: case COLOR_RGB2YUV_YV12: case COLOR_RGBA2YUV_YV12:
+        uidx = 2;
+        break;
+
+    case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2RGBA_YVYU:
+    case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_RGB2YUV_IYUV: case COLOR_RGBA2YUV_IYUV:
+    case COLOR_YUV2BGR_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2RGBA_NV21:
+    case COLOR_YUV2BGR_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2RGBA_YV12:
+        uidx = 1;
+        break;
+
+    case COLOR_YUV2BGR_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2RGBA_NV12:
+    case COLOR_YUV2BGR_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2RGBA_IYUV:
+    case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGB_UYVY: case COLOR_YUV2RGBA_UYVY:
+    case COLOR_YUV2BGR_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGB_YUY2: case COLOR_YUV2RGBA_YUY2:
+        uidx = 0;
+        break;
+
+    default:
+        break;
+    }
+    return uidx;
+}
+
+} // namespace::
+
+// Compile-time set of up to three ints. -1 stands for "slot not used"; the
+// partial specializations below leave the unused slots out of the test, so
+// contains(-1) is not true merely because of a defaulted slot.
+template<int i0, int i1 = -1, int i2 = -1>
+struct Set
+{
+    static bool contains(int i)
+    {
+        if (i == i0)
+            return true;
+        if (i == i1)
+            return true;
+        return i == i2;
+    }
+};
+
+// Two-element set.
+template<int i0, int i1>
+struct Set<i0, i1, -1>
+{
+    static bool contains(int i)
+    {
+        if (i == i0)
+            return true;
+        return i == i1;
+    }
+};
+
+// One-element set.
+template<int i0>
+struct Set<i0, -1, -1>
+{
+    static bool contains(int i)
+    {
+        return i == i0;
+    }
+};
+
+// How the destination size is derived from the source size by the helpers
+// that take a SizePolicy template argument.
+enum SizePolicy
+{
+    TO_YUV,    // destination height = source height / 2 * 3
+    FROM_YUV,  // destination height = source height * 2 / 3
+    NONE       // destination size = source size
+};
+
+// Prepares the matrices of a CPU color conversion.
+//
+// VScn, VDcn and VDepth are Set<> types listing the accepted source channel
+// counts, destination channel counts and depths; sizePolicy selects how the
+// destination size follows from the source size. The constructor asserts the
+// constraints, obtains src, creates _dst with the source depth and dcn
+// channels, and stores the resulting Mat in dst.
+template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
+struct CvtHelper
+{
+    CvtHelper(InputArray _src, OutputArray _dst, int dcn)
+    {
+        const int stype = _src.type();
+        scn = CV_MAT_CN(stype);
+        depth = CV_MAT_DEPTH(stype);
+
+        CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) );
+
+        // inplace processing (#6653): when source and destination are the
+        // same object, convert from a copy of the source
+        const bool inplace = (_src.getObj() == _dst.getObj());
+        if (inplace)
+            _src.copyTo(src);
+        else
+            src = _src.getMat();
+
+        Size sz = src.size();
+        switch (sizePolicy)
+        {
+        case TO_YUV:
+            // even width and height required; output is 3/2 as tall
+            CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0);
+            dstSz = Size(sz.width, sz.height / 2 * 3);
+            break;
+        case FROM_YUV:
+            // even width and a height divisible by 3 required; output is 2/3 as tall
+            CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0);
+            dstSz = Size(sz.width, sz.height * 2 / 3);
+            break;
+        case NONE:
+        default:
+            dstSz = sz;
+            break;
+        }
+
+        const int dtype = CV_MAKETYPE(depth, dcn);
+        _dst.create(dstSz, dtype);
+        dst = _dst.getMat();
+    }
+
+    Mat src, dst;
+    int depth, scn;
+    Size dstSz;
+};
+
+#ifdef HAVE_OPENCL
+
+template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
+struct OclHelper // UMat/OpenCL helper: checks the source, creates dst, then builds and runs a conversion kernel
+{
+    OclHelper( InputArray _src, OutputArray _dst, int dcn) // asserts scn/dcn/depth against the Set<> arguments and allocates dst
+    {
+        src = _src.getUMat();
+        Size sz = src.size(), dstSz;
+        int scn = src.channels();
+        int depth = src.depth();
+
+        CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) );
+        switch (sizePolicy)
+        {
+        case TO_YUV: // even width and height required; dst is 3/2 as tall as src
+            CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
+            dstSz = Size(sz.width, sz.height / 2 * 3);
+            break;
+        case FROM_YUV: // even width and height divisible by 3 required; dst is 2/3 as tall as src
+            CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 );
+            dstSz = Size(sz.width, sz.height * 2 / 3);
+            break;
+        case NONE:
+        default: // dst has the size of src
+            dstSz = sz;
+            break;
+        }
+
+        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
+        dst = _dst.getUMat();
+    }
+
+    bool createKernel(cv::String name, ocl::ProgramSource& source, cv::String options) // builds kernel `name`, fills globalSize, binds src and dst; false if the kernel is empty
+    {
+        ocl::Device dev = ocl::Device::getDefault();
+        int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1; // 4 rows per work-item on Intel GPUs, 1 elsewhere
+        int pxPerWIx = 1;
+
+        cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
+                                        src.depth(), src.channels(), pxPerWIy); // defines passed to every kernel, ahead of the caller's `options`
+
+        switch (sizePolicy)
+        {
+        case TO_YUV:
+            if (dev.isIntel() &&
+                    src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
+                    dst.step % 4 == 0 && dst.offset % 4 == 0) // Intel only, and only when cols/steps/offsets are multiples of 4
+            {
+                pxPerWIx = 2;
+            }
+            globalSize[0] = (size_t)dst.cols/(2*pxPerWIx);
+            globalSize[1] = ((size_t)dst.rows/3 + pxPerWIy - 1) / pxPerWIy; // ceil((dst.rows/3) / pxPerWIy)
+            baseOptions += format("-D PIX_PER_WI_X=%d ", pxPerWIx); // PIX_PER_WI_X is defined for TO_YUV only
+            break;
+        case FROM_YUV:
+            globalSize[0] = (size_t)dst.cols/2;
+            globalSize[1] = ((size_t)dst.rows/2 + pxPerWIy - 1) / pxPerWIy; // ceil((dst.rows/2) / pxPerWIy)
+            break;
+        case NONE:
+        default:
+            globalSize[0] = (size_t)src.cols;
+            globalSize[1] = ((size_t)src.rows + pxPerWIy - 1) / pxPerWIy; // ceil(src.rows / pxPerWIy)
+            break;
+        }
+
+        k.create(name.c_str(), source, baseOptions + options);
+
+        if(k.empty())
+            return false;
+
+        nArgs = k.set(0, ocl::KernelArg::ReadOnlyNoSize(src)); // src is bound first, starting at argument index 0
+        nArgs = k.set(nArgs, ocl::KernelArg::WriteOnly(dst)); // nArgs keeps the value returned by set() and is used as the next index
+        return true;
+    }
+
+    bool run() // launches the kernel over the 2-D globalSize; call only after createKernel() succeeded (globalSize is unset before that)
+    {
+        return k.run(2, globalSize, NULL, false);
+    }
+
+    template<typename T>
+    void setArg(const T& arg) // binds one more kernel argument at index nArgs; nArgs is set by createKernel()
+    {
+        nArgs = k.set(nArgs, arg);
+    }
+
+    UMat src, dst;
+    ocl::Kernel k;
+    size_t globalSize[2]; // filled by createKernel()
+    int nArgs; // not initialised by the constructor; assigned in createKernel()
+};
+
+#endif
+
+///////////////////////////// Top-level template function ////////////////////////////////
+
+// parallel_for_ body that applies a row converter `cvt` to each row of a
+// range. Rows are addressed as data + row * step, with the steps given in
+// bytes; the row pointers are reinterpreted as Cvt::channel_type before the
+// call. The converter is held by reference, so it must outlive the invoker.
+template <typename Cvt>
+class CvtColorLoop_Invoker : public ParallelLoopBody
+{
+    typedef typename Cvt::channel_type _Tp;
+public:
+
+    CvtColorLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt) :
+        ParallelLoopBody(),
+        src_data(src_data_), src_step(src_step_),
+        dst_data(dst_data_), dst_step(dst_step_),
+        width(width_), cvt(_cvt)
+    {
+    }
+
+    virtual void operator()(const Range& range) const
+    {
+        CV_TRACE_FUNCTION();
+
+        for( int row = range.start; row < range.end; ++row )
+        {
+            const uchar* srcRow = src_data + static_cast<size_t>(row) * src_step;
+            uchar* dstRow = dst_data + static_cast<size_t>(row) * dst_step;
+            cvt(reinterpret_cast<const _Tp*>(srcRow), reinterpret_cast<_Tp*>(dstRow), width);
+        }
+    }
+
+private:
+    const uchar * src_data;
+    const size_t src_step;
+    uchar * dst_data;
+    const size_t dst_step;
+    const int width;
+    const Cvt& cvt;
+
+    // declared but not defined: the invoker is not assignable
+    const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
+};
+
+// Applies the row converter `cvt` to all `height` rows of an image, letting
+// parallel_for_ distribute the rows.
+//
+// src_data / dst_data point at the first source / destination row and
+// src_step / dst_step are the row strides in bytes. `width` is forwarded to
+// `cvt` unchanged. The third parallel_for_ argument is the stripe-count hint:
+// one stripe per 65536 (1 << 16) pixels.
+template <typename Cvt>
+void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
+{
+    // Multiply in double: `width * height` evaluated in int overflows
+    // (undefined behaviour) once the image has more than INT_MAX pixels.
+    const double nstripes = static_cast<double>(width) * height / static_cast<double>(1<<16);
+
+    parallel_for_(Range(0, height),
+                  CvtColorLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt),
+                  nstripes);
+}
+
+#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) // IPP helpers are compiled only when HAVE_IPP is set and IPP_VERSION_X100 is at least 700
+#  define NEED_IPP 1
+#else
+#  define NEED_IPP 0
+#endif
+
+#if NEED_IPP
+
+#define MAX_IPP8u   255
+#define MAX_IPP16u  65535
+#define MAX_IPP32f  1.0
+
+typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *); // (src, srcStep, dst, dstStep, size, channel order) as called by the functors in this header
+typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize); // (src, srcStep, dst, dstStep, size) as called by the functors in this header
+typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *); // not called in this header; NOTE(review): last argument presumably the gray coefficients
+
+// parallel_for_ body for IPP-backed conversions. Unlike the per-row invoker,
+// `cvt` receives the whole row range at once as
+// (src, srcStep, dst, dstStep, width, rowCount) and reports success as bool.
+// The constructor sets *ok to true; any failing range sets it to false.
+template <typename Cvt>
+class CvtColorIPPLoop_Invoker :
+        public ParallelLoopBody
+{
+public:
+
+    CvtColorIPPLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt, bool *_ok) :
+        ParallelLoopBody(),
+        src_data(src_data_), src_step(src_step_),
+        dst_data(dst_data_), dst_step(dst_step_),
+        width(width_), cvt(_cvt), ok(_ok)
+    {
+        *ok = true;
+    }
+
+    virtual void operator()(const Range& range) const
+    {
+        const void *srcRows = src_data + src_step * range.start;
+        void *dstRows = dst_data + dst_step * range.start;
+        const int rowCount = range.end - range.start;
+
+        const bool converted = cvt(srcRows, static_cast<int>(src_step), dstRows, static_cast<int>(dst_step), width, rowCount);
+        if( converted )
+        {
+            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
+        }
+        else
+            *ok = false;
+    }
+
+private:
+    const uchar * src_data;
+    const size_t src_step;
+    uchar * dst_data;
+    const size_t dst_step;
+    const int width;
+    const Cvt& cvt;
+    bool *ok;
+
+    // declared but not defined: the invoker is not assignable
+    const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
+};
+
+
+// Runs the IPP functor `cvt` over all `height` rows through parallel_for_ and
+// returns true only if every processed range reported success.
+//
+// src_data / dst_data point at the first row, src_step / dst_step are the row
+// strides in bytes, and `width` is forwarded to `cvt` unchanged. The
+// stripe-count hint is one stripe per 65536 (1 << 16) pixels.
+template <typename Cvt>
+bool CvtColorIPPLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
+{
+    // The invoker's constructor overwrites this with true; the initialiser
+    // only keeps the variable from ever being read uninitialised.
+    bool ok = false;
+
+    // Multiply in double: `width * height` evaluated in int overflows
+    // (undefined behaviour) once the image has more than INT_MAX pixels.
+    const double nstripes = static_cast<double>(width) * height / static_cast<double>(1<<16);
+
+    parallel_for_(Range(0, height), CvtColorIPPLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt, &ok), nstripes );
+    return ok;
+}
+
+
+// Variant of CvtColorIPPLoop that supports in-place calls: when src_data and
+// dst_data are the same pointer, the source is copied first and the
+// conversion reads from that copy. Returns true only if every processed
+// range reported success.
+template <typename Cvt>
+bool CvtColorIPPLoopCopy(const uchar * src_data, size_t src_step, int src_type, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
+{
+    // header over the caller's buffer; no pixel data is copied here
+    Mat src(Size(width, height), src_type, const_cast<uchar*>(src_data), src_step);
+
+    Mat source;
+    if( src_data == dst_data )
+        src.copyTo(source);
+    else
+        source = src;
+
+    bool ok;
+    const double nstripes = source.total()/(double)(1<<16);
+    parallel_for_(Range(0, source.rows),
+                  CvtColorIPPLoop_Invoker<Cvt>(source.data, source.step, dst_data, dst_step,
+                                               source.cols, cvt, &ok),
+                  nstripes );
+    return ok;
+}
+
+
+// Wraps a single ippiGeneralFunc. The call operator returns false when the
+// function pointer is null, otherwise whether the IPP status is >= 0.
+struct IPPGeneralFunctor
+{
+    IPPGeneralFunctor(ippiGeneralFunc _func) : ippiColorConvertGeneral(_func){}
+
+    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
+    {
+        if (!ippiColorConvertGeneral)
+            return false;
+        return CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0;
+    }
+
+private:
+    ippiGeneralFunc ippiColorConvertGeneral;
+};
+
+
+// Wraps a single ippiReorderFunc together with its channel order table. The
+// first three entries come from the constructor; the fourth is always 3.
+// The call operator returns false when the function pointer is null,
+// otherwise whether the IPP status is >= 0.
+struct IPPReorderFunctor
+{
+    IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : ippiColorConvertReorder(_func)
+    {
+        order[0] = _order0;
+        order[1] = _order1;
+        order[2] = _order2;
+        order[3] = 3;
+    }
+
+    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
+    {
+        if (!ippiColorConvertReorder)
+            return false;
+        return CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0;
+    }
+
+private:
+    ippiReorderFunc ippiColorConvertReorder;
+    int order[4];
+};
+
+
+// Two-step IPP conversion: reorder the channels into a temporary 3-channel
+// matrix of the given depth, then run the general conversion from that
+// temporary into dst. Returns false if either function pointer is null or
+// either IPP call reports a negative status.
+struct IPPReorderGeneralFunctor
+{
+    IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
+        ippiColorConvertReorder(_func1), ippiColorConvertGeneral(_func2), depth(_depth)
+    {
+        order[0] = _order0;
+        order[1] = _order1;
+        order[2] = _order2;
+        order[3] = 3;
+    }
+
+    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
+    {
+        if (ippiColorConvertReorder == 0 || ippiColorConvertGeneral == 0)
+            return false;
+
+        // intermediate buffer holding the reordered pixels
+        Mat temp(rows, cols, CV_MAKETYPE(depth, 3));
+
+        const bool reordered =
+            CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) >= 0;
+        if (!reordered)
+            return false;
+
+        return CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
+    }
+
+private:
+    ippiReorderFunc ippiColorConvertReorder;
+    ippiGeneralFunc ippiColorConvertGeneral;
+    int order[4];
+    int depth;
+};
+
+
+// Two-step IPP conversion in the opposite order: run the general conversion
+// into a temporary 3-channel matrix of the given depth, then reorder the
+// channels from that temporary into dst. Returns false if either function
+// pointer is null or either IPP call reports a negative status.
+struct IPPGeneralReorderFunctor
+{
+    IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
+        ippiColorConvertGeneral(_func1), ippiColorConvertReorder(_func2), depth(_depth)
+    {
+        order[0] = _order0;
+        order[1] = _order1;
+        order[2] = _order2;
+        order[3] = 3;
+    }
+
+    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
+    {
+        if (ippiColorConvertGeneral == 0 || ippiColorConvertReorder == 0)
+            return false;
+
+        // intermediate buffer holding the converted, not yet reordered pixels
+        Mat temp(rows, cols, CV_MAKETYPE(depth, 3));
+
+        const bool converted =
+            CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) >= 0;
+        if (!converted)
+            return false;
+
+        return CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
+    }
+
+private:
+    ippiGeneralFunc ippiColorConvertGeneral;
+    ippiReorderFunc ippiColorConvertReorder;
+    int order[4];
+    int depth;
+};
+
+extern ippiReorderFunc ippiSwapChannelsC3C4RTab[8]; // 8-entry IPP function tables, declared only; NOTE(review): presumably indexed by depth and defined in color_rgb.cpp - confirm
+extern ippiReorderFunc ippiSwapChannelsC4C3RTab[8];
+extern ippiReorderFunc ippiSwapChannelsC3RTab[8];
+
+#endif
+
+#ifdef HAVE_OPENCL
+
+bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb ); // Lab / Luv / XYZ OpenCL converters; NOTE(review): bool presumably means "OpenCL path succeeded" - confirm in the definitions
+bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb );
+bool oclCvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb);
+bool oclCvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb);
+bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx );
+bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
+
+bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ); // HSV / HLS OpenCL converters
+bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full );
+bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full );
+bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full );
+
+bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse ); // RGB-family OpenCL converters (channel reorder, 5x5 packed, gray, premultiplied alpha)
+bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits );
+bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits );
+bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits );
+bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits );
+bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx );
+bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn );
+bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst );
+bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst );
+
+bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx); // YCrCb / YUV OpenCL converters
+bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx); // NOTE(review): spelled "oclCvtcolor" (lowercase c) unlike its siblings; a rename must change the definition and callers too
+bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx );
+bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
+
+bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ); // one-, two- and three-plane YUV OpenCL converters
+bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
+bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
+bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx );
+bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
+
+#endif
+
+void cvtColorBGR2Lab( InputArray _src, OutputArray _dst, bool swapb, bool srgb); // Lab / Luv / XYZ CPU converters, declared here and defined outside this header
+void cvtColorBGR2Luv( InputArray _src, OutputArray _dst, bool swapb, bool srgb);
+void cvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb );
+void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb );
+void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb );
+void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb );
+
+void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb); // YUV / YCrCb CPU converters; NOTE(review): crcb presumably selects YCrCb over YUV - confirm
+void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb);
+
+void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn); // one-, two- and three-plane YUV CPU converters
+void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
+void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx ); // takes the Y and UV planes as two separate inputs
+void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
+void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx);
+void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
+void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi );
+
+void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ); // HSV / HLS CPU converters
+void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange );
+void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange);
+void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange);
+
+void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb); // RGB-family CPU converters (channel reorder, 5x5 packed, gray, premultiplied alpha)
+void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits);
+void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits);
+void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb);
+void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn);
+void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits);
+void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits);
+void cvtColorRGBA2mRGBA(InputArray _src, OutputArray _dst);
+void cvtColormRGBA2RGBA(InputArray _src, OutputArray _dst);
+
+} //namespace cv
--- a/modules/imgproc/src/color_hsv.cpp
+++ b/modules/imgproc/src/color_hsv.cpp
--- a/modules/imgproc/src/color_lab.cpp
+++ b/modules/imgproc/src/color_lab.cpp
--- a/modules/imgproc/src/color_rgb.cpp
+++ b/modules/imgproc/src/color_rgb.cpp
--- a/modules/imgproc/src/color_yuv.cpp
+++ b/modules/imgproc/src/color_yuv.cpp
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@ -41,6 +41,50 @@
 //
 //M*/

+/********************************* COPYRIGHT NOTICE *******************************\
+  Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
+  from MD-Mathematische Dienste GmbH. Below is the copyright notice:
+
+    IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+    By downloading, copying, installing or using the software you agree
+    to this license. If you do not agree to this license, do not download,
+    install, copy or use the software.
+
+    Contributors License Agreement:
+
+      Copyright (c) 2002,
+      MD-Mathematische Dienste GmbH
+      Im Defdahl 5-10
+      44141 Dortmund
+      Germany
+      www.md-it.de
+
+    Redistribution and use in source and binary forms,
+    with or without modification, are permitted provided
+    that the following conditions are met:
+
+    Redistributions of source code must retain
+    the above copyright notice, this list of conditions and the following disclaimer.
+    Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+    The name of Contributor may not be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+    THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
+    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+    OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+    THE POSSIBILITY OF SUCH DAMAGE.
+\**********************************************************************************/
+
+
 #include "precomp.hpp"

 #include <limits>
--- a/modules/imgproc/src/opencl/color_hsv.cl
+++ b/modules/imgproc/src/opencl/color_hsv.cl
@ -0,0 +1,621 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/**************************************PUBLICFUNC*************************************/
+
+// Element-type configuration, selected by the `depth` macro. `depth` is not defined in
+// this file -- presumably it is supplied as a build option by the host; TODO confirm.
+//   DATA_TYPE    - scalar type of one channel
+//   MAX_NUM      - value the kernels below write into a 4th (alpha) output channel
+//   HALF_MAX_NUM - half of the range
+//   COEFF_TYPE   - type of conversion coefficients
+//   SAT_CAST     - saturating conversion to DATA_TYPE (identity for float)
+//   DEPTH_n      - flag tested further down to choose the kernel variant
+#if depth == 0
+    #define DATA_TYPE uchar
+    #define MAX_NUM  255
+    #define HALF_MAX_NUM 128
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_uchar_sat(num)
+    #define DEPTH_0
+#elif depth == 2
+    #define DATA_TYPE ushort
+    #define MAX_NUM  65535
+    #define HALF_MAX_NUM 32768
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_ushort_sat(num)
+    #define DEPTH_2
+#elif depth == 5
+    #define DATA_TYPE float
+    #define MAX_NUM  1.0f
+    #define HALF_MAX_NUM 0.5f
+    #define COEFF_TYPE float
+    #define SAT_CAST(num) (num)
+    #define DEPTH_5
+#else
+    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
+#endif
+
+// Rounding right shift: adds half of 2^n before shifting x right by n bits.
+#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
+
+// Number of fractional bits used by the fixed-point 8-bit RGB -> HSV kernel.
+enum
+{
+    hsv_shift  = 12
+};
+
+// Size in bytes of one source / destination pixel. `scn` and `dcn` (channel counts)
+// are not defined in this file -- presumably build options; TODO confirm.
+#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
+
+// Hue scale factor; falls back to 0 when the build does not define it.
+#ifndef hscale
+#define hscale 0
+#endif
+
+// Value added to a negative hue in the 8-bit RGB -> HSV kernel; falls back to 0.
+#ifndef hrange
+#define hrange 0
+#endif
+
+// Map colour names to vector components of a loaded pixel.
+// bidx == 0: blue is component x (B, G, R order); otherwise red is component x.
+#if bidx == 0
+#define R_COMP z
+#define G_COMP y
+#define B_COMP x
+#else
+#define R_COMP x
+#define G_COMP y
+#define B_COMP z
+#endif
+
+//////////////////////////////////// RGB <-> HSV //////////////////////////////////////
+
+// Channel-selection table used by the HSV/HLS -> RGB kernels.
+// Row    : hue sector index (0..5).
+// Columns: index into the per-pixel tab[4] array that supplies the
+//          blue, green and red output, in that order.
+__constant int sector_data[6][3] = {
+    { 1, 3, 0 },
+    { 1, 0, 2 },
+    { 3, 0, 1 },
+    { 0, 2, 1 },
+    { 0, 1, 3 },
+    { 2, 1, 0 }
+};
+
+#ifdef DEPTH_0
+
+// RGB -> HSV for 8-bit pixels, computed in fixed point with hsv_shift fractional bits.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   src, dst               - byte pointers to the images
+//   *_step, *_offset       - row pitch and start offset, both in bytes
+//   rows, cols             - image size in pixels
+//   sdiv_table, hdiv_table - multipliers indexed by V and by (max - min) respectively
+// Output per pixel: H, S, V (one byte each).
+// NOTE(review): vload4 always reads four bytes although only three are used -- confirm
+// that the source buffer tolerates this when scn == 3.
+__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
+                      __global uchar* dst, int dst_step, int dst_offset,
+                      int rows, int cols,
+                      __constant int * sdiv_table, __constant int * hdiv_table)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    // Rounding term added before every ">> hsv_shift".
+    const int half_unit = 1 << (hsv_shift-1);
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        const uchar4 pix = vload4(0, src + src_index);
+        const int blue = pix.B_COMP, green = pix.G_COMP, red = pix.R_COMP;
+
+        const int vmax = max(max(blue, green), red);
+        const int vmin = min(min(blue, green), red);
+        const int span = vmax - vmin;
+
+        const int sat = mad24(span, sdiv_table[vmax], half_unit) >> hsv_shift;
+
+        // Un-normalised hue: depends on which channel holds the maximum.
+        int hue;
+        if (vmax == red)
+            hue = green - blue;
+        else if (vmax == green)
+            hue = mad24(span, 2, blue - red);
+        else
+            hue = mad24(4, span, red - green);
+
+        hue = mad24(hue, hdiv_table[span], half_unit) >> hsv_shift;
+        if (hue < 0)
+            hue += hrange;
+
+        dst[dst_index] = convert_uchar_sat_rte(hue);
+        dst[dst_index + 1] = (uchar)sat;
+        dst[dst_index + 2] = (uchar)vmax;
+    }
+}
+
+// HSV -> RGB for 8-bit pixels.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   src, dst         - byte pointers to the images
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+// Input per pixel: H, S, V. S and V are scaled by 1/255, H by hscale (then wrapped
+// into [0, 6)). Blue is written at byte bidx, green at 1, red at bidx^2; when
+// dcn == 4 the fourth byte is set to MAX_NUM.
+__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
+                      __global uchar* dst, int dst_step, int dst_offset,
+                      int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        const uchar4 hsv = vload4(0, src + src_index);
+
+        float hue = hsv.x;
+        const float sat = hsv.y*(1/255.f), val = hsv.z*(1/255.f);
+
+        // Zero saturation: all three channels equal the value.
+        float blue = val, green = val, red = val;
+
+        if (sat != 0)
+        {
+            hue *= hscale;
+            if (hue < 0)
+            {
+                while (hue < 0)
+                    hue += 6;
+            }
+            else
+            {
+                while (hue >= 6)
+                    hue -= 6;
+            }
+
+            int sector = convert_int_sat_rtn(hue);
+            hue -= sector;
+            // Keep the table index valid if the wrap above ended outside 0..5.
+            if ((unsigned)sector >= 6u)
+            {
+                sector = 0;
+                hue = 0.f;
+            }
+
+            float tab[4];
+            tab[0] = val;
+            tab[1] = val*(1.f - sat);
+            tab[2] = val*(1.f - sat*hue);
+            tab[3] = val*(1.f - sat*(1.f - hue));
+
+            blue = tab[sector_data[sector][0]];
+            green = tab[sector_data[sector][1]];
+            red = tab[sector_data[sector][2]];
+        }
+
+        dst[dst_index + bidx] = convert_uchar_sat_rte(blue*255.f);
+        dst[dst_index + 1] = convert_uchar_sat_rte(green*255.f);
+        dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(red*255.f);
+#if dcn == 4
+        dst[dst_index + 3] = MAX_NUM;
+#endif
+    }
+}
+
+#elif defined DEPTH_5
+
+// RGB -> HSV for 32-bit float pixels.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, dstptr   - byte pointers to the images (reinterpreted as float per pixel)
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+// Output per pixel: H (degrees multiplied by hscale), S, V.
+__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
+                      __global uchar* dstptr, int dst_step, int dst_offset,
+                      int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        __global const float * in = (__global const float *)(srcptr + src_index);
+        __global float * out = (__global float *)(dstptr + dst_index);
+
+        const float4 pix = vload4(0, in);
+        const float blue = pix.B_COMP, green = pix.G_COMP, red = pix.R_COMP;
+
+        // Explicit comparisons (not fmax/fmin) to keep the original NaN behaviour.
+        float vmax = red, vmin = red;
+        vmax = vmax < green ? green : vmax;
+        vmax = vmax < blue ? blue : vmax;
+        vmin = vmin > green ? green : vmin;
+        vmin = vmin > blue ? blue : vmin;
+
+        const float span = vmax - vmin;
+        const float sat = span/(fabs(vmax) + FLT_EPSILON);
+        // FLT_EPSILON keeps the division defined when all channels are equal.
+        const float scale = 60.f/(span + FLT_EPSILON);
+
+        float hue;
+        if (vmax == red)
+            hue = (green - blue)*scale;
+        else if (vmax == green)
+            hue = fma(blue - red, scale, 120.f);
+        else
+            hue = fma(red - green, scale, 240.f);
+
+        if (hue < 0)
+            hue += 360.f;
+
+        out[0] = hue*hscale;
+        out[1] = sat;
+        out[2] = vmax;
+    }
+}
+
+// HSV -> RGB for 32-bit float pixels.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, dstptr   - byte pointers to the images (reinterpreted as float per pixel)
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+// Input per pixel: H, S, V. H is multiplied by hscale and wrapped into [0, 6).
+// Blue is written at element bidx, green at 1, red at bidx^2; when dcn == 4 the
+// fourth element is set to MAX_NUM.
+__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
+                      __global uchar* dstptr, int dst_step, int dst_offset,
+                      int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        __global const float * in = (__global const float *)(srcptr + src_index);
+        __global float * out = (__global float *)(dstptr + dst_index);
+        const float4 hsv = vload4(0, in);
+
+        float hue = hsv.x;
+        const float sat = hsv.y, val = hsv.z;
+
+        // Zero saturation: all three channels equal the value.
+        float blue = val, green = val, red = val;
+
+        if (sat != 0)
+        {
+            hue *= hscale;
+            if (hue < 0)
+            {
+                while (hue < 0)
+                    hue += 6;
+            }
+            else
+            {
+                while (hue >= 6)
+                    hue -= 6;
+            }
+
+            int sector = convert_int_sat_rtn(hue);
+            hue -= sector;
+            // A tiny negative hue can round up to exactly 6 after the wrap; keep
+            // the table index inside 0..5.
+            if ((unsigned)sector >= 6u)
+            {
+                sector = 0;
+                hue = 0.f;
+            }
+
+            float tab[4];
+            tab[0] = val;
+            tab[1] = val*(1.f - sat);
+            tab[2] = val*(1.f - sat*hue);
+            tab[3] = val*(1.f - sat*(1.f - hue));
+
+            blue = tab[sector_data[sector][0]];
+            green = tab[sector_data[sector][1]];
+            red = tab[sector_data[sector][2]];
+        }
+
+        out[bidx] = blue;
+        out[1] = green;
+        out[bidx^2] = red;
+#if dcn == 4
+        out[3] = MAX_NUM;
+#endif
+    }
+}
+
+#endif
+
+///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
+
+#ifdef DEPTH_0
+
+// RGB -> HLS for 8-bit pixels (computed in float after scaling the input by 1/255).
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   src, dst         - byte pointers to the images
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+// Output per pixel: H (degrees multiplied by hscale), L*255, S*255, each rounded
+// and saturated to a byte.
+__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
+                      __global uchar* dst, int dst_step, int dst_offset,
+                      int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        const uchar4 pix = vload4(0, src + src_index);
+
+        const float blue = pix.B_COMP*(1/255.f);
+        const float green = pix.G_COMP*(1/255.f);
+        const float red = pix.R_COMP*(1/255.f);
+
+        float vmax = red, vmin = red;
+        vmax = vmax < green ? green : vmax;
+        vmax = vmax < blue ? blue : vmax;
+        vmin = vmin > green ? green : vmin;
+        vmin = vmin > blue ? blue : vmin;
+
+        const float span = vmax - vmin;
+        const float light = (vmax + vmin)*0.5f;
+
+        // Hue and saturation stay 0 for (near-)grey pixels.
+        float hue = 0.f, sat = 0.f;
+
+        if (span > FLT_EPSILON)
+        {
+            sat = light < 0.5f ? span/(vmax + vmin) : span/(2 - vmax - vmin);
+            const float scale = 60.f/span;
+
+            if (vmax == red)
+                hue = (green - blue)*scale;
+            else if (vmax == green)
+                hue = fma(blue - red, scale, 120.f);
+            else
+                hue = fma(red - green, scale, 240.f);
+
+            if (hue < 0.f)
+                hue += 360.f;
+        }
+
+        dst[dst_index] = convert_uchar_sat_rte(hue*hscale);
+        dst[dst_index + 1] = convert_uchar_sat_rte(light*255.f);
+        dst[dst_index + 2] = convert_uchar_sat_rte(sat*255.f);
+    }
+}
+
+// HLS -> RGB for 8-bit pixels.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   src, dst         - byte pointers to the images
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+// Input per pixel: H, L, S. L and S are scaled by 1/255, H by hscale (then wrapped
+// into [0, 6)). Blue is written at byte bidx, green at 1, red at bidx^2; when
+// dcn == 4 the fourth byte is set to MAX_NUM.
+__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
+                      __global uchar* dst, int dst_step, int dst_offset,
+                      int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        const uchar4 hls = vload4(0, src + src_index);
+
+        float hue = hls.x;
+        const float light = hls.y*(1.f/255.f), sat = hls.z*(1.f/255.f);
+
+        // Zero saturation: all three channels equal the lightness.
+        float blue = light, green = light, red = light;
+
+        if (sat != 0)
+        {
+            const float p2 = light <= 0.5f ? light*(1 + sat) : light + sat - light*sat;
+            const float p1 = 2*light - p2;
+
+            hue *= hscale;
+            if (hue < 0)
+            {
+                while (hue < 0)
+                    hue += 6;
+            }
+            else
+            {
+                while (hue >= 6)
+                    hue -= 6;
+            }
+
+            const int sector = convert_int_sat_rtn(hue);
+            hue -= sector;
+
+            float tab[4];
+            tab[0] = p2;
+            tab[1] = p1;
+            tab[2] = fma(p2 - p1, 1-hue, p1);
+            tab[3] = fma(p2 - p1, hue, p1);
+
+            blue = tab[sector_data[sector][0]];
+            green = tab[sector_data[sector][1]];
+            red = tab[sector_data[sector][2]];
+        }
+
+        dst[dst_index + bidx] = convert_uchar_sat_rte(blue*255.f);
+        dst[dst_index + 1] = convert_uchar_sat_rte(green*255.f);
+        dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(red*255.f);
+#if dcn == 4
+        dst[dst_index + 3] = MAX_NUM;
+#endif
+    }
+}
+
+#elif defined DEPTH_5
+
+// RGB -> HLS for 32-bit float pixels.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, dstptr   - byte pointers to the images (reinterpreted as float per pixel)
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+// Output per pixel: H (degrees multiplied by hscale), L, S.
+__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
+                      __global uchar* dstptr, int dst_step, int dst_offset,
+                      int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        __global const float * in = (__global const float *)(srcptr + src_index);
+        __global float * out = (__global float *)(dstptr + dst_index);
+
+        const float4 pix = vload4(0, in);
+        const float blue = pix.B_COMP, green = pix.G_COMP, red = pix.R_COMP;
+
+        float vmax = red, vmin = red;
+        vmax = vmax < green ? green : vmax;
+        vmax = vmax < blue ? blue : vmax;
+        vmin = vmin > green ? green : vmin;
+        vmin = vmin > blue ? blue : vmin;
+
+        const float span = vmax - vmin;
+        const float light = (vmax + vmin)*0.5f;
+
+        // Hue and saturation stay 0 for (near-)grey pixels.
+        float hue = 0.f, sat = 0.f;
+
+        if (span > FLT_EPSILON)
+        {
+            sat = light < 0.5f ? span/(vmax + vmin) : span/(2 - vmax - vmin);
+            const float scale = 60.f/span;
+
+            if (vmax == red)
+                hue = (green - blue)*scale;
+            else if (vmax == green)
+                hue = fma(blue - red, scale, 120.f);
+            else
+                hue = fma(red - green, scale, 240.f);
+
+            if (hue < 0.f)
+                hue += 360.f;
+        }
+
+        out[0] = hue*hscale;
+        out[1] = light;
+        out[2] = sat;
+    }
+}
+
+// HLS -> RGB for 32-bit float pixels.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, dstptr   - byte pointers to the images (reinterpreted as float per pixel)
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+// Input per pixel: H, L, S. H is multiplied by hscale and wrapped into [0, 6).
+// Blue is written at element bidx, green at 1, red at bidx^2; when dcn == 4 the
+// fourth element is set to MAX_NUM.
+// NOTE(review): a non-finite hue (+/-inf) never leaves the wrap loop below.
+__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
+                      __global uchar* dstptr, int dst_step, int dst_offset,
+                      int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const float * src = (__global const float *)(srcptr + src_index);
+                __global float * dst = (__global float *)(dstptr + dst_index);
+                float4 src_pix = vload4(0, src);
+
+                float h = src_pix.x, l = src_pix.y, s = src_pix.z;
+                float b, g, r;
+
+                if (s != 0)
+                {
+                    float tab[4];
+                    int sector;
+
+                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
+                    float p1 = 2*l - p2;
+
+                    h *= hscale;
+                    if( h < 0 )
+                        do h += 6; while( h < 0 );
+                    else if( h >= 6 )
+                        do h -= 6; while( h >= 6 );
+
+                    sector = convert_int_sat_rtn(h);
+                    h -= sector;
+                    // A tiny negative hue becomes exactly 6.0f after "h += 6" (float
+                    // rounding), giving sector == 6 and an out-of-bounds read of
+                    // sector_data. Treat it as the start of sector 0, exactly as the
+                    // HSV2RGB kernels do.
+                    if( (unsigned)sector >= 6u )
+                    {
+                        sector = 0;
+                        h = 0.f;
+                    }
+
+                    tab[0] = p2;
+                    tab[1] = p1;
+                    tab[2] = fma(p2 - p1, 1-h, p1);
+                    tab[3] = fma(p2 - p1, h, p1);
+
+                    b = tab[sector_data[sector][0]];
+                    g = tab[sector_data[sector][1]];
+                    r = tab[sector_data[sector][2]];
+                }
+                else
+                    b = g = r = l;
+
+                dst[bidx] = b;
+                dst[1] = g;
+                dst[bidx^2] = r;
+#if dcn == 4
+                dst[3] = MAX_NUM;
+#endif
+
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+#endif
--- a/modules/imgproc/src/opencl/color_lab.cl
+++ b/modules/imgproc/src/opencl/color_lab.cl
@ -0,0 +1,735 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Element-type configuration, selected by the `depth` macro. `depth` is not defined in
+// this file -- presumably it is supplied as a build option by the host; TODO confirm.
+//   DATA_TYPE    - scalar type of one channel
+//   MAX_NUM      - value the kernels below write into a 4th (alpha) output channel
+//   HALF_MAX_NUM - half of the range
+//   COEFF_TYPE   - type of the conversion-matrix coefficients passed to the XYZ kernels
+//   SAT_CAST     - saturating conversion to DATA_TYPE (identity for float)
+//   DEPTH_n      - flag tested further down to choose the kernel variant
+#if depth == 0
+    #define DATA_TYPE uchar
+    #define MAX_NUM  255
+    #define HALF_MAX_NUM 128
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_uchar_sat(num)
+    #define DEPTH_0
+#elif depth == 2
+    #define DATA_TYPE ushort
+    #define MAX_NUM  65535
+    #define HALF_MAX_NUM 32768
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_ushort_sat(num)
+    #define DEPTH_2
+#elif depth == 5
+    #define DATA_TYPE float
+    #define MAX_NUM  1.0f
+    #define HALF_MAX_NUM 0.5f
+    #define COEFF_TYPE float
+    #define SAT_CAST(num) (num)
+    #define DEPTH_5
+#else
+    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
+#endif
+
+// Rounding right shift: adds half of 2^n before shifting x right by n bits.
+#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
+
+// Number of fractional bits of the integer coefficients used by the XYZ kernels.
+enum
+{
+    xyz_shift  = 12,
+};
+
+// Size in bytes of one source / destination pixel. `scn` and `dcn` (channel counts)
+// are not defined in this file -- presumably build options; TODO confirm.
+#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
+
+// Two-level token pasting so that macro arguments are expanded before being joined.
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+
+// Vector forms of DATA_TYPE, e.g. uchar4 / uchar3 when DATA_TYPE is uchar.
+#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
+#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
+
+///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
+
+// RGB -> XYZ: multiplies every pixel by a 3x3 matrix.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, dstptr   - byte pointers to the images (reinterpreted as DATA_TYPE per pixel)
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+//   coeffs           - 9 matrix entries, row-major; float for DEPTH_5, otherwise
+//                      integers carrying xyz_shift fractional bits
+// The first three loaded components are used in memory order; no bidx swap is done
+// here (presumably the host arranges coeffs accordingly -- TODO confirm).
+__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset,
+                      int rows, int cols, __constant COEFF_TYPE * coeffs)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        __global const DATA_TYPE * in = (__global const DATA_TYPE *)(srcptr + src_index);
+        __global DATA_TYPE * out = (__global DATA_TYPE *)(dstptr + dst_index);
+
+        const DATA_TYPE_4 pix = vload4(0, in);
+        const DATA_TYPE c0 = pix.x, c1 = pix.y, c2 = pix.z;
+
+#ifdef DEPTH_5
+        const float X = fma(c0, coeffs[0], fma(c1, coeffs[1], c2 * coeffs[2]));
+        const float Y = fma(c0, coeffs[3], fma(c1, coeffs[4], c2 * coeffs[5]));
+        const float Z = fma(c0, coeffs[6], fma(c1, coeffs[7], c2 * coeffs[8]));
+#else
+        // Fixed point: accumulate, then drop the xyz_shift fractional bits with rounding.
+        const int X = CV_DESCALE(mad24(c0, coeffs[0], mad24(c1, coeffs[1], c2 * coeffs[2])), xyz_shift);
+        const int Y = CV_DESCALE(mad24(c0, coeffs[3], mad24(c1, coeffs[4], c2 * coeffs[5])), xyz_shift);
+        const int Z = CV_DESCALE(mad24(c0, coeffs[6], mad24(c1, coeffs[7], c2 * coeffs[8])), xyz_shift);
+#endif
+
+        out[0] = SAT_CAST(X);
+        out[1] = SAT_CAST(Y);
+        out[2] = SAT_CAST(Z);
+    }
+}
+
+// XYZ -> RGB: multiplies every pixel by a 3x3 matrix.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, dstptr   - byte pointers to the images (reinterpreted as DATA_TYPE per pixel)
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+//   coeffs           - 9 matrix entries, row-major; float for DEPTH_5, otherwise
+//                      integers carrying xyz_shift fractional bits
+// The three results are stored in matrix-row order at elements 0, 1, 2; with
+// dcn == 4 the fourth element is MAX_NUM.
+__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset,
+                      int rows, int cols, __constant COEFF_TYPE * coeffs)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        __global const DATA_TYPE * in = (__global const DATA_TYPE *)(srcptr + src_index);
+        __global DATA_TYPE * out = (__global DATA_TYPE *)(dstptr + dst_index);
+
+        const DATA_TYPE_4 pix = vload4(0, in);
+        const DATA_TYPE X = pix.x, Y = pix.y, Z = pix.z;
+
+#ifdef DEPTH_5
+        const float v0 = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
+        const float v1 = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
+        const float v2 = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
+#else
+        // Fixed point: accumulate, then drop the xyz_shift fractional bits with rounding.
+        const int v0 = CV_DESCALE(mad24(X, coeffs[0], mad24(Y, coeffs[1], Z * coeffs[2])), xyz_shift);
+        const int v1 = CV_DESCALE(mad24(X, coeffs[3], mad24(Y, coeffs[4], Z * coeffs[5])), xyz_shift);
+        const int v2 = CV_DESCALE(mad24(X, coeffs[6], mad24(Y, coeffs[7], Z * coeffs[8])), xyz_shift);
+#endif
+
+        const DATA_TYPE out0 = SAT_CAST(v0);
+        const DATA_TYPE out1 = SAT_CAST(v1);
+        const DATA_TYPE out2 = SAT_CAST(v2);
+
+#if dcn == 3 || defined DEPTH_5
+        out[0] = out0;
+        out[1] = out1;
+        out[2] = out2;
+#if dcn == 4
+        out[3] = MAX_NUM;
+#endif
+#else
+        // Integer 4-channel output: one vector store for the whole pixel.
+        *(__global DATA_TYPE_4 *)out = (DATA_TYPE_4)(out0, out1, out2, MAX_NUM);
+#endif
+    }
+}
+
+/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////
+
+// Fixed-point shifts used by the 8-bit Lab kernel: lab_shift is applied to the matrix
+// product, lab_shift2 (= lab_shift + gamma_shift) to the final L/a/b values.
+#define lab_shift xyz_shift
+#define gamma_shift 3
+#define lab_shift2 (lab_shift + gamma_shift)
+// Segment count of the float gamma spline table and the factor that maps an input
+// value to a table position.
+#define GAMMA_TAB_SIZE 1024
+#define GammaTabScale (float)GAMMA_TAB_SIZE
+
+// Evaluates a piecewise cubic stored as 4 coefficients per segment.
+//   x   - position; its floor (clamped to [0, n-1]) selects the segment, the
+//         remainder is the polynomial argument
+//   tab - coefficient table, 4 floats per segment in ascending power order
+//   n   - number of segments
+// Returns c0 + c1*t + c2*t^2 + c3*t^3 evaluated with Horner's scheme.
+inline float splineInterpolate(float x, __global const float * tab, int n)
+{
+    const int segment = clamp(convert_int_sat_rtn(x), 0, n-1);
+    const float t = x - segment;
+    __global const float * c = tab + (segment << 2);
+
+    float acc = fma(c[3], t, c[2]);
+    acc = fma(acc, t, c[1]);
+    return fma(acc, t, c[0]);
+}
+
+#ifdef DEPTH_0
+
+// RGB -> Lab for 8-bit pixels, entirely in integer arithmetic with lookup tables.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   src, dst         - byte pointers to the images
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+//   gammaTab         - table indexed by the 8-bit channel value
+//   LabCbrtTab_b     - table indexed by the descaled matrix product
+//   coeffs           - 9 integer matrix entries, row-major
+//   Lscale, Lshift   - scale and offset applied to fY to obtain L
+// Output per pixel: L, a, b (a and b are offset by 128), saturated to bytes.
+__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
+                      __constant int * coeffs, int Lscale, int Lshift)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    // 128 expressed with lab_shift2 fractional bits: the bias of the a/b channels.
+    const int ab_bias = 128*(1 << lab_shift2);
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        const uchar4 pix = vload4(0, src + src_index);
+        __global uchar * out = dst + dst_index;
+
+        const int R = gammaTab[pix.x], G = gammaTab[pix.y], B = gammaTab[pix.z];
+
+        const int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, coeffs[0], mad24(G, coeffs[1], B*coeffs[2])), lab_shift)];
+        const int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, coeffs[3], mad24(G, coeffs[4], B*coeffs[5])), lab_shift)];
+        const int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, coeffs[6], mad24(G, coeffs[7], B*coeffs[8])), lab_shift)];
+
+        const int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
+        const int a = CV_DESCALE( mad24(500, fX - fY, ab_bias), lab_shift2 );
+        const int b = CV_DESCALE( mad24(200, fY - fZ, ab_bias), lab_shift2 );
+
+        out[0] = SAT_CAST(L);
+        out[1] = SAT_CAST(a);
+        out[2] = SAT_CAST(b);
+    }
+}
+
+#elif defined DEPTH_5
+
+// RGB -> Lab for 32-bit float pixels.
+// Each work-item handles column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, dstptr   - byte pointers to the images (reinterpreted as float per pixel)
+//   *_step, *_offset - row pitch and start offset, both in bytes
+//   rows, cols       - image size in pixels
+//   gammaTab         - (SRGB builds only) spline table applied to each clamped channel
+//   coeffs           - 9 float matrix entries, row-major
+//   _1_3             - not read by this kernel body
+//   _a               - additive term of the linear branch of f(t)
+// Output per pixel: L, a, b.
+__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float _1_3, float _a)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (col >= cols)
+        return;
+
+    int src_index = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dst_index = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int i = 0; i < PIX_PER_WI_Y; ++i, ++row, src_index += src_step, dst_index += dst_step)
+    {
+        if (row >= rows)
+            break;
+
+        __global const float * in = (__global const float *)(srcptr + src_index);
+        __global float * out = (__global float *)(dstptr + dst_index);
+        const float4 pix = vload4(0, in);
+
+        // Input channels are limited to [0, 1] before any further processing.
+        float R = clamp(pix.x, 0.0f, 1.0f);
+        float G = clamp(pix.y, 0.0f, 1.0f);
+        float B = clamp(pix.z, 0.0f, 1.0f);
+
+#ifdef SRGB
+        R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+        G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+        B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+        const float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
+        const float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
+        const float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
+
+        // f(t): cube root above the threshold, linear below it.
+        // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
+        const float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
+        const float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
+        const float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);
+
+        out[0] = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
+        out[1] = 500.f * (FX - FY);
+        out[2] = 200.f * (FY - FZ);
+    }
+}
+
+#endif
+
+// Converts one Lab triple (srcbuf[0..2] = L, a, b) to three output channels in dstbuf[0..2].
+//   gammaTab         - (SRGB builds only) table handed to splineInterpolate after the matrix step
+//   coeffs           - nine floats used as a row-major 3x3 matrix on (X, Y, Z)
+//   lThresh, fThresh - switch points between the linear and the cubic branch
+//                      for L and for the intermediate fx/fz values respectively
+// The matrix result is clamped to [0, 1] before the optional gamma step.
+inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    const float lum = srcbuf[0];
+    const float chromaA = srcbuf[1];
+    const float chromaB = srcbuf[2];
+
+    // Recover Y and f(Y) from L
+    // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
+    float Yv, fY;
+    if (lum <= lThresh)
+    {
+        Yv = lum / 903.3f;
+        fY = fma(7.787f, Yv, 16.0f / 116.0f);
+    }
+    else
+    {
+        fY = (lum + 16.0f) / 116.0f;
+        Yv = fY * fY * fY;
+    }
+
+    // f(X) and f(Z) follow from a and b; invert f() for each of them
+    const float fX = chromaA / 500.0f + fY;
+    const float fZ = fY - chromaB / 200.0f;
+
+    float Xv, Zv;
+    if (fX <= fThresh)
+        Xv = (fX - 16.0f / 116.0f) / 7.787f;
+    else
+        Xv = fX * fX * fX;
+
+    if (fZ <= fThresh)
+        Zv = (fZ - 16.0f / 116.0f) / 7.787f;
+    else
+        Zv = fZ * fZ * fZ;
+
+    // matrix product coeffs * (X, Y, Z), limited to the unit range
+    float ch0 = clamp(fma(coeffs[0], Xv, fma(coeffs[1], Yv, coeffs[2] * Zv)), 0.0f, 1.0f);
+    float ch1 = clamp(fma(coeffs[3], Xv, fma(coeffs[4], Yv, coeffs[5] * Zv)), 0.0f, 1.0f);
+    float ch2 = clamp(fma(coeffs[6], Xv, fma(coeffs[7], Yv, coeffs[8] * Zv)), 0.0f, 1.0f);
+
+#ifdef SRGB
+    ch0 = splineInterpolate(ch0 * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+    ch1 = splineInterpolate(ch1 * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+    ch2 = splineInterpolate(ch2 * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+    dstbuf[0] = ch0;
+    dstbuf[1] = ch1;
+    dstbuf[2] = ch2;
+}
+
+#ifdef DEPTH_0
+
+// CIE Lab -> [s|l]RGB for 8-bit images.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+//   gammaTab                  - (SRGB builds only) forwarded to Lab2BGR_f
+//   coeffs, lThresh, fThresh  - forwarded unchanged to Lab2BGR_f
+// The 8-bit input is unpacked as L = byte*100/255, a = byte - 128, b = byte - 128.
+__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const uchar* src_ptr = src + src_index;
+                __global uchar * dst_ptr = dst + dst_index;
+                // Only three components are used; a 4-wide load would read one byte
+                // beyond a 3-channel pixel, so load exactly three.
+                uchar3 src_pix = vload3(0, src_ptr);
+
+                float srcbuf[3], dstbuf[3];
+                srcbuf[0] = src_pix.x*(100.f/255.f);
+                srcbuf[1] = convert_float(src_pix.y - 128);
+                srcbuf[2] = convert_float(src_pix.z - 128);
+
+                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+#ifdef SRGB
+                    gammaTab,
+#endif
+                    coeffs, lThresh, fThresh);
+
+                // scale the [0, 1] result back to bytes; 4-channel output gets MAX_NUM in the 4th channel
+#if dcn == 3
+                dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
+                dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
+                dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
+#else
+                *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
+                    SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
+#endif
+                // advance to the same column of the next row
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+#elif defined DEPTH_5
+
+// CIE Lab -> [s|l]RGB for 32-bit float images.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dstptr, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                   - image size in pixels
+//   gammaTab                     - (SRGB builds only) forwarded to Lab2BGR_f
+//   coeffs, lThresh, fThresh     - forwarded unchanged to Lab2BGR_f
+__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const float * src = (__global const float *)(srcptr + src_index);
+                __global float * dst = (__global float *)(dstptr + dst_index);
+                // Only three components are used; a 4-wide load would read one float
+                // beyond a 3-channel pixel, so load exactly three.
+                float3 src_pix = vload3(0, src);
+
+                float srcbuf[3], dstbuf[3];
+                srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;
+
+                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+#ifdef SRGB
+                    gammaTab,
+#endif
+                    coeffs, lThresh, fThresh);
+
+                dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
+#if dcn == 4
+                // 4-channel output: fill the 4th channel with MAX_NUM
+                dst[3] = MAX_NUM;
+#endif
+                // advance to the same column of the next row
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+#endif
+
+/////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////
+
+#define LAB_CBRT_TAB_SIZE 1024 // size argument passed together with LabCbrtTab to splineInterpolate in BGR2Luv
+#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift)) // NOTE(review): not referenced by the Luv kernels in this section; gamma_shift is defined elsewhere - confirm usage
+
+__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f; // factor applied to Y before the LabCbrtTab lookup in BGR2Luv
+
+#ifdef DEPTH_5
+
+// [s|l]RGB -> CIE Luv for 32-bit float images.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dstptr, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                   - image size in pixels
+//   gammaTab                     - (SRGB builds only) table handed to splineInterpolate for the gamma step
+//   LabCbrtTab                   - table handed to splineInterpolate to obtain f(Y)
+//   coeffs                       - nine floats used as a row-major 3x3 matrix on the clamped pixel channels
+//   _un, _vn                     - constants subtracted from the u and v chromaticity terms
+//                                  (presumably derived from the white point by the host - confirm)
+__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    int srcOfs = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dstOfs = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            __global const float * srcPix = (__global const float *)(srcptr + srcOfs);
+            __global float * dstPix = (__global float *)(dstptr + dstOfs);
+
+            float R = clamp(srcPix[0], 0.f, 1.f);
+            float G = clamp(srcPix[1], 0.f, 1.f);
+            float B = clamp(srcPix[2], 0.f, 1.f);
+
+#ifdef SRGB
+            R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+            G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+            B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+            // matrix product coeffs * (R, G, B)
+            const float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
+            const float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
+            const float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
+
+            // L = 116*f(Y) - 16 with f(Y) read from the table
+            const float fY = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+            const float L = fma(116.f, fY, -16.f);
+
+            // 52 = 4*13; the denominator is kept away from zero by FLT_EPSILON
+            const float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
+
+            dstPix[0] = L;
+            dstPix[1] = L*fma(X, d, -_un);
+            dstPix[2] = L*fma(2.25f, Y*d, -_vn);
+
+            // advance to the same column of the next row
+            ++row;
+            dstOfs += dst_step;
+            srcOfs += src_step;
+        }
+    }
+}
+
+#elif defined DEPTH_0
+
+// [s|l]RGB -> CIE Luv for 8-bit images.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+//   gammaTab                  - (SRGB builds only) table handed to splineInterpolate for the gamma step
+//   LabCbrtTab                - table handed to splineInterpolate to obtain f(Y)
+//   coeffs                    - nine floats used as a row-major 3x3 matrix on the scaled pixel channels
+//   _un, _vn                  - constants subtracted from the u and v chromaticity terms
+//                               (presumably derived from the white point by the host - confirm)
+// Input bytes are scaled by 1/255; L, u, v are rescaled to the byte range on output.
+__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    __global const uchar * srcPix = src + mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    __global uchar * dstPix = dst + mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+    const float scale = 1.0f / 255.0f;
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            float R = srcPix[0]*scale;
+            float G = srcPix[1]*scale;
+            float B = srcPix[2]*scale;
+
+#ifdef SRGB
+            R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+            G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+            B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+            // matrix product coeffs * (R, G, B)
+            const float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
+            const float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
+            const float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
+
+            // L = 116*f(Y) - 16 with f(Y) read from the table
+            const float fY = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+            const float L = 116.f*fY - 16.f;
+
+            // 52 = 4*13; the denominator is kept away from zero by FLT_EPSILON
+            const float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
+            const float u = L*(X*d - _un);
+            const float v = L*fma(2.25f, Y*d, -_vn);
+
+            dstPix[0] = SAT_CAST(L * 2.55f);
+            //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134)
+            dstPix[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
+            //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122)
+            dstPix[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));
+
+            // advance to the same column of the next row
+            ++row;
+            dstPix += dst_step;
+            srcPix += src_step;
+        }
+    }
+}
+
+#endif
+
+#ifdef DEPTH_5
+
+// CIE Luv -> [s|l]RGB for 32-bit float images.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dstptr, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                   - image size in pixels
+//   gammaTab                     - (SRGB builds only) table handed to splineInterpolate after the matrix step
+//   coeffs                       - nine floats used as a row-major 3x3 matrix on (X, Y, Z)
+//   _un, _vn                     - constants multiplied by L and added to u and v
+//                                  (presumably derived from the white point by the host - confirm)
+// The matrix result is clamped to [0, 1]; 4-channel output gets MAX_NUM in the 4th channel.
+__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float _un, float _vn)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    int srcOfs = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dstOfs = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            __global const float * srcPix = (__global const float *)(srcptr + srcOfs);
+            __global float * dstPix = (__global float *)(dstptr + dstOfs);
+
+            const float L = srcPix[0];
+            const float u = srcPix[1];
+            const float v = srcPix[2];
+
+            // Y from L: cubic branch for L >= 8, linear branch below
+            float Y;
+            if (L >= 8)
+            {
+                const float t = fma(L, 1.f/116.f, 16.f/116.f);
+                Y = t*t*t;
+            }
+            else
+            {
+                Y = L * (1.0f/903.3f); // L*(3./29.)^3
+            }
+
+            const float up = 3.f*fma(L, _un, u);
+            // the reciprocal is limited to [-0.25, 0.25]
+            const float vp = clamp(0.25f/fma(L, _vn, v), -0.25f, 0.25f);
+            const float X = 3.f*Y*up*vp;
+            const float Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);
+
+            // matrix product coeffs * (X, Y, Z), limited to the unit range
+            float R = clamp(fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2])), 0.f, 1.f);
+            float G = clamp(fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5])), 0.f, 1.f);
+            float B = clamp(fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8])), 0.f, 1.f);
+
+#ifdef SRGB
+            R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+            G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+            B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+            dstPix[0] = R;
+            dstPix[1] = G;
+            dstPix[2] = B;
+#if dcn == 4
+            dstPix[3] = MAX_NUM;
+#endif
+            // advance to the same column of the next row
+            ++row;
+            dstOfs += dst_step;
+            srcOfs += src_step;
+        }
+    }
+}
+
+#elif defined DEPTH_0
+
+// CIE Luv -> [s|l]RGB for 8-bit images.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+//   gammaTab                  - (SRGB builds only) table handed to splineInterpolate after the matrix step
+//   coeffs                    - nine floats used as a row-major 3x3 matrix on (X, Y, Z)
+//   _un, _vn                  - constants multiplied by L and added to u and v
+//                               (presumably derived from the white point by the host - confirm)
+// Input bytes are first mapped back to L, u, v; the result is scaled by 255 and saturated.
+__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float _un, float _vn)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    __global const uchar * srcPix = src + mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    __global uchar * dstPix = dst + mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            const float L = srcPix[0]*(100.f/255.f);
+            // 1.388235294117647 = (220+134)/255
+            const float u = fma(convert_float(srcPix[1]), 1.388235294117647f, -134.f);
+            // 1.027450980392157 = (140+122)/255
+            const float v = fma(convert_float(srcPix[2]), 1.027450980392157f, - 140.f);
+
+            // Y from L: cubic branch for L >= 8, linear branch below
+            float Y;
+            if (L >= 8)
+            {
+                const float t = fma(L, 1.f/116.f, 16.f/116.f);
+                Y = t*t*t;
+            }
+            else
+            {
+                Y = L * (1.0f/903.3f); // L*(3./29.)^3
+            }
+
+            const float up = 3.f*fma(L, _un, u);
+            // the reciprocal is limited to [-0.25, 0.25]
+            const float vp = clamp(0.25f/fma(L, _vn, v), -0.25f, 0.25f);
+
+            //limit X, Z to [0, 2] to fit white point
+            const float X = clamp(3.f*Y*up*vp, 0.f, 2.f);
+            const float Z = clamp(Y*fma(fma(12.f*13.f, L, -up), vp, -5.f), 0.f, 2.f);
+
+            // matrix product coeffs * (X, Y, Z), limited to the unit range
+            float R = clamp(fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2])), 0.f, 1.f);
+            float G = clamp(fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5])), 0.f, 1.f);
+            float B = clamp(fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8])), 0.f, 1.f);
+
+#ifdef SRGB
+            R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+            G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+            B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+            const uchar out0 = SAT_CAST(R * 255.0f);
+            const uchar out1 = SAT_CAST(G * 255.0f);
+            const uchar out2 = SAT_CAST(B * 255.0f);
+
+#if dcn == 4
+            // one 4-byte store; the 4th channel is MAX_NUM
+            *(__global uchar4 *)dstPix = (uchar4)(out0, out1, out2, MAX_NUM);
+#else
+            dstPix[0] = out0;
+            dstPix[1] = out1;
+            dstPix[2] = out2;
+#endif
+
+            // advance to the same column of the next row
+            ++row;
+            dstPix += dst_step;
+            srcPix += src_step;
+        }
+    }
+}
+
+#endif
--- a/modules/imgproc/src/opencl/color_rgb.cl
+++ b/modules/imgproc/src/opencl/color_rgb.cl
@ -0,0 +1,454 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/**************************************PUBLICFUNC*************************************/
+
+#if depth == 0 // per-depth element type, value range and saturation helper; 0 is CV_8U (see the #error text below)
+    #define DATA_TYPE uchar
+    #define MAX_NUM  255 // value written to a newly created 4th channel
+    #define HALF_MAX_NUM 128 // rounding term used by RGBA2mRGBA before dividing by MAX_NUM
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_uchar_sat(num)
+    #define DEPTH_0
+#elif depth == 2 // CV_16U
+    #define DATA_TYPE ushort
+    #define MAX_NUM  65535
+    #define HALF_MAX_NUM 32768
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_ushort_sat(num)
+    #define DEPTH_2
+#elif depth == 5 // CV_32F
+    #define DATA_TYPE float
+    #define MAX_NUM  1.0f
+    #define HALF_MAX_NUM 0.5f
+    #define COEFF_TYPE float
+    #define SAT_CAST(num) (num) // float data is stored as is, no saturation
+    #define DEPTH_5
+#else
+    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
+#endif
+
+#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) // right shift by n after adding half of 2^n (rounding)
+
+enum
+{
+    yuv_shift  = 14, // shift used with CV_DESCALE for the integer gray weights below
+    R2Y        = 4899, // integer weights; R2Y + G2Y + B2Y = 16384 = 1 << yuv_shift
+    G2Y        = 9617,
+    B2Y        = 1868
+};
+
+//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
+#define B2YF 0.114f
+#define G2YF 0.587f
+#define R2YF 0.299f
+
+#define scnbytes ((int)sizeof(DATA_TYPE)*scn) // bytes per source pixel (scn channels)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn) // bytes per destination pixel (dcn channels)
+
+#if bidx == 0 // bidx == 0: blue is read from component x and red from z; otherwise the other way round
+#define R_COMP z
+#define G_COMP y
+#define B_COMP x
+#else
+#define R_COMP x
+#define G_COMP y
+#define B_COMP z
+#endif
+
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y) // extra level so that macro arguments are expanded before pasting
+
+#define DATA_TYPE_4 CAT(DATA_TYPE, 4) // e.g. uchar4 when DATA_TYPE is uchar
+#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
+
+///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
+
+// Colour -> gray: writes one DATA_TYPE value per pixel, a weighted sum of the B, G, R components.
+// Float builds use the B2YF/G2YF/R2YF weights; integer builds use B2Y/G2Y/R2Y with CV_DESCALE.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dstptr, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                   - image size in pixels
+__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
+                       __global uchar * dstptr, int dst_step, int dst_offset,
+                       int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    int srcOfs = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dstOfs = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            __global const DATA_TYPE* srcPix = (__global const DATA_TYPE*)(srcptr + srcOfs);
+            __global DATA_TYPE* dstPix = (__global DATA_TYPE*)(dstptr + dstOfs);
+            DATA_TYPE_3 color = vload3(0, srcPix);
+#ifdef DEPTH_5
+            dstPix[0] = fma(color.B_COMP, B2YF, fma(color.G_COMP, G2YF, color.R_COMP * R2YF));
+#else
+            dstPix[0] = (DATA_TYPE)CV_DESCALE(mad24(color.B_COMP, B2Y, mad24(color.G_COMP, G2Y, mul24(color.R_COMP, R2Y))), yuv_shift);
+#endif
+            // advance to the same column of the next row
+            ++row;
+            srcOfs += src_step;
+            dstOfs += dst_step;
+        }
+    }
+}
+
+// Gray -> colour: replicates the gray value into the first three channels;
+// a 4-channel destination additionally gets MAX_NUM in the 4th channel.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dstptr, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                   - image size in pixels
+__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
+                       __global uchar * dstptr, int dst_step, int dst_offset,
+                       int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    int srcOfs = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dstOfs = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            __global const DATA_TYPE* srcPix = (__global const DATA_TYPE*)(srcptr + srcOfs);
+            __global DATA_TYPE* dstPix = (__global DATA_TYPE*)(dstptr + dstOfs);
+            const DATA_TYPE gray = srcPix[0];
+#if dcn != 3 && !defined DEPTH_5
+            // non-float output with a 4th channel: one vector store
+            *(__global DATA_TYPE_4 *)dstPix = (DATA_TYPE_4)(gray, gray, gray, MAX_NUM);
+#else
+            dstPix[2] = gray;
+            dstPix[1] = gray;
+            dstPix[0] = gray;
+#if dcn == 4
+            dstPix[3] = MAX_NUM;
+#endif
+#endif
+            // advance to the same column of the next row
+            ++row;
+            dstOfs += dst_step;
+            srcOfs += src_step;
+        }
+    }
+}
+
+///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
+
+// Channel copy between 3- and 4-channel pixels, optionally swapping the first and third
+// channel (REVERSE builds). A 4-channel destination takes its 4th channel from the source
+// when the source has one, otherwise MAX_NUM.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   srcptr, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dstptr, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                   - image size in pixels
+__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
+                  __global uchar* dstptr, int dst_step, int dst_offset,
+                  int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    int srcOfs = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dstOfs = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            __global const DATA_TYPE * srcPix = (__global const DATA_TYPE *)(srcptr + srcOfs);
+            __global DATA_TYPE * dstPix = (__global DATA_TYPE *)(dstptr + dstOfs);
+
+            // load the whole source pixel before anything is written
+#if scn == 3
+            DATA_TYPE_3 color = vload3(0, srcPix);
+#else
+            DATA_TYPE_4 color = vload4(0, srcPix);
+#endif
+
+#ifndef REVERSE
+            dstPix[0] = color.x;
+            dstPix[1] = color.y;
+            dstPix[2] = color.z;
+#else
+            dstPix[0] = color.z;
+            dstPix[1] = color.y;
+            dstPix[2] = color.x;
+#endif
+
+#if dcn == 4 && scn == 3
+            dstPix[3] = MAX_NUM;
+#elif dcn == 4
+            dstPix[3] = srcPix[3];
+#endif
+
+            // advance to the same column of the next row
+            ++row;
+            dstOfs += dst_step;
+            srcOfs += src_step;
+        }
+    }
+}
+
+///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
+
+// Unpacks 16-bit packed pixels (5-6-5 when greenbits == 6, otherwise 5-5-5 with a top flag bit)
+// into 8-bit channels. The low 5 bits go to channel bidx, the high colour field to channel bidx^2.
+// With dcn == 4 the 4th channel is 255 for 5-6-5, or 255/0 depending on bit 15 for 5-5-5.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
+                         __global uchar* dst, int dst_step, int dst_offset,
+                         int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    int srcOfs = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dstOfs = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            const ushort pix16 = *((__global const ushort*)(src + srcOfs));
+            __global uchar* dstPix = dst + dstOfs;
+
+            // the low five bits are handled the same way in both layouts
+            dstPix[bidx] = (uchar)(pix16 << 3);
+#if greenbits == 6
+            dstPix[1] = (uchar)((pix16 >> 3) & ~3);
+            dstPix[bidx^2] = (uchar)((pix16 >> 8) & ~7);
+#else
+            dstPix[1] = (uchar)((pix16 >> 2) & ~7);
+            dstPix[bidx^2] = (uchar)((pix16 >> 7) & ~7);
+#endif
+
+#if dcn == 4 && greenbits == 6
+            dstPix[3] = 255;
+#elif dcn == 4
+            dstPix[3] = (pix16 & 0x8000) ? 255 : 0;
+#endif
+
+            // advance to the same column of the next row
+            ++row;
+            dstOfs += dst_step;
+            srcOfs += src_step;
+        }
+    }
+}
+
+// Packs 8-bit colour pixels into 16-bit values: 5-6-5 when greenbits == 6, otherwise 5-5-5.
+// For 5-5-5 with a 4-channel source, bit 15 is set when the 4th channel is non-zero.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
+                         __global uchar* dst, int dst_step, int dst_offset,
+                         int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                // Load exactly as many bytes as the source pixel has: a 4-wide load on a
+                // 3-channel pixel would read one byte beyond it. The .w component is only
+                // needed by the 5-5-5 / 4-channel branch below.
+#if scn == 3
+                uchar3 src_pix = vload3(0, src + src_index);
+#else
+                uchar4 src_pix = vload4(0, src + src_index);
+#endif
+
+#if greenbits == 6
+                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
+#elif scn == 3
+                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
+#else
+                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
+                    ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
+#endif
+
+                // advance to the same column of the next row
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////
+
+// 16-bit packed colour (5-6-5 when greenbits == 6, otherwise 5-5-5) -> 8-bit gray.
+// The three fields are expanded to 8-bit values and combined with the B2Y/G2Y/R2Y weights.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
+                          __global uchar* dst, int dst_step, int dst_offset,
+                          int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    int srcOfs = mad24(row, src_step, mad24(col, scnbytes, src_offset));
+    int dstOfs = mad24(row, dst_step, dst_offset + col);
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            const int pix16 = *((__global const ushort*)(src + srcOfs));
+
+            // expand the packed fields to 8-bit channel values
+            const int blue = (pix16 << 3) & 0xf8;
+#if greenbits == 6
+            const int green = (pix16 >> 3) & 0xfc;
+            const int red = (pix16 >> 8) & 0xf8;
+#else
+            const int green = (pix16 >> 2) & 0xf8;
+            const int red = (pix16 >> 7) & 0xf8;
+#endif
+            dst[dstOfs] = (uchar)CV_DESCALE(mad24(blue, B2Y, mad24(green, G2Y, red * R2Y)), yuv_shift);
+
+            // advance to the same column of the next row
+            ++row;
+            dstOfs += dst_step;
+            srcOfs += src_step;
+        }
+    }
+}
+
+// 8-bit gray -> 16-bit packed colour (5-6-5 when greenbits == 6, otherwise 5-5-5):
+// the gray value is truncated to the field width and written into all three fields.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
+                          __global uchar* dst, int dst_step, int dst_offset,
+                          int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    int srcOfs = mad24(row, src_step, src_offset + col);
+    int dstOfs = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            const int gray = src[srcOfs];
+
+#if greenbits == 6
+            const ushort pix16 = (ushort)((gray >> 3) | ((gray & ~3) << 3) | ((gray & ~7) << 8));
+#else
+            const int gray5 = gray >> 3;
+            const ushort pix16 = (ushort)(gray5|(gray5 << 5)|(gray5 << 10));
+#endif
+            *((__global ushort*)(dst + dstOfs)) = pix16;
+
+            // advance to the same column of the next row
+            ++row;
+            dstOfs += dst_step;
+            srcOfs += src_step;
+        }
+    }
+}
+
+/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
+
+#ifdef DEPTH_0
+
+// 8-bit RGBA -> alpha-premultiplied RGBA: each colour channel becomes
+// (c * alpha + HALF_MAX_NUM) / MAX_NUM, the alpha channel is copied.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
+                         __global uchar* dst, int dst_step, int dst_offset,
+                         int rows, int cols)
+{
+    const int col = get_global_id(0);
+    int row = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns past the right edge have nothing to do
+    if (col >= cols)
+        return;
+
+    // four bytes per pixel on both sides
+    int srcOfs = mad24(row, src_step, src_offset + (col << 2));
+    int dstOfs = mad24(row, dst_step, dst_offset + (col << 2));
+
+    #pragma unroll
+    for (int k = 0; k < PIX_PER_WI_Y; ++k)
+    {
+        if (row < rows)
+        {
+            const uchar4 rgba = *(__global const uchar4 *)(src + srcOfs);
+
+            const uchar4 premul =
+                (uchar4)(mad24(rgba.x, rgba.w, HALF_MAX_NUM) / MAX_NUM,
+                         mad24(rgba.y, rgba.w, HALF_MAX_NUM) / MAX_NUM,
+                         mad24(rgba.z, rgba.w, HALF_MAX_NUM) / MAX_NUM, rgba.w);
+            *(__global uchar4 *)(dst + dstOfs) = premul;
+
+            // advance to the same column of the next row
+            ++row;
+            dstOfs += dst_step;
+            srcOfs += src_step;
+        }
+    }
+}
+
+// 8-bit alpha-premultiplied RGBA -> RGBA: each colour channel becomes
+// (c * MAX_NUM + alpha/2) / alpha; pixels with alpha == 0 are written as all zeros.
+// The quotient is saturated: when a colour channel is larger than alpha (not a valid
+// premultiplied pixel) it exceeds MAX_NUM and would otherwise wrap around on the
+// conversion to uchar.
+// Each work-item handles column get_global_id(0) of up to PIX_PER_WI_Y consecutive rows.
+//   src, src_step, src_offset - source base pointer, row stride and start offset, in bytes
+//   dst, dst_step, dst_offset - destination base pointer, row stride and start offset, in bytes
+//   rows, cols                - image size in pixels
+__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
+                         __global uchar* dst, int dst_step, int dst_offset,
+                         int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        // four bytes per pixel on both sides
+        int src_index = mad24(y, src_step, mad24(x, 4, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
+                // v3 is alpha; v3_half is the rounding term of the division
+                uchar v3 = src_pix.w, v3_half = v3 / 2;
+
+                if (v3 == 0)
+                    *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);
+                else
+                    *(__global uchar4 *)(dst + dst_index) =
+                        (uchar4)(SAT_CAST(mad24(src_pix.x, MAX_NUM, v3_half) / v3),
+                                 SAT_CAST(mad24(src_pix.y, MAX_NUM, v3_half) / v3),
+                                 SAT_CAST(mad24(src_pix.z, MAX_NUM, v3_half) / v3), v3);
+
+                // advance to the same column of the next row
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+#endif
--- a/modules/imgproc/src/opencl/color_yuv.cl
+++ b/modules/imgproc/src/opencl/color_yuv.cl
@ -0,0 +1,674 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/**************************************PUBLICFUNC*************************************/
+
+// Element type, value range, coefficient type and saturating cast are selected at
+// build time from the `depth` define: 0 -> uchar, 2 -> ushort, 5 -> float.
+// HALF_MAX_NUM is the mid-range value used as the chroma offset by the kernels below.
+#if depth == 0
+    #define DATA_TYPE uchar
+    #define MAX_NUM  255
+    #define HALF_MAX_NUM 128
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_uchar_sat(num)
+    #define DEPTH_0
+#elif depth == 2
+    #define DATA_TYPE ushort
+    #define MAX_NUM  65535
+    #define HALF_MAX_NUM 32768
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_ushort_sat(num)
+    #define DEPTH_2
+#elif depth == 5
+    #define DATA_TYPE float
+    #define MAX_NUM  1.0f
+    #define HALF_MAX_NUM 0.5f
+    #define COEFF_TYPE float
+    #define SAT_CAST(num) (num)
+    #define DEPTH_5
+#else
+    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
+#endif
+
+// Fixed-point descale: divide x by 2^n with rounding to nearest (adds 2^(n-1) before the shift).
+#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
+
+// Integer luma weights in yuv_shift-bit fixed point; R2Y + G2Y + B2Y == 1 << yuv_shift.
+enum
+{
+    yuv_shift  = 14,
+    R2Y        = 4899,
+    G2Y        = 9617,
+    B2Y        = 1868,
+};
+
+//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
+// The *F macros are the floating-point values; the matching *I macros are the same
+// values scaled by 2^yuv_shift for the integer code paths.
+#define B2YF 0.114f
+#define G2YF 0.587f
+#define R2YF 0.299f
+//to YCrCb
+#define YCBF 0.564f
+#define YCRF 0.713f
+#define YCBI 9241
+#define YCRI 11682
+//to YUV
+#define B2UF 0.492f
+#define R2VF 0.877f
+#define B2UI 8061
+#define R2VI 14369
+//from YUV
+#define U2BF 2.032f
+#define U2GF -0.395f
+#define V2GF -0.581f
+#define V2RF 1.140f
+#define U2BI 33292
+#define U2GI -6472
+#define V2GI -9519
+#define V2RI 18678
+//from YCrCb
+#define CR2RF 1.403f
+#define CB2GF -0.344f
+#define CR2GF -0.714f
+#define CB2BF 1.773f
+#define CR2RI 22987
+#define CB2GI -5636
+#define CR2GI -11698
+#define CB2BI 29049
+
+// Size in bytes of one source / destination pixel (scn and dcn are build-time channel counts).
+#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
+
+// Vector components holding red, green and blue of a loaded pixel:
+// bidx == 0 puts blue in .x and red in .z, any other bidx puts red in .x and blue in .z.
+#if bidx == 0
+#define R_COMP z
+#define G_COMP y
+#define B_COMP x
+#else
+#define R_COMP x
+#define G_COMP y
+#define B_COMP z
+#endif
+
+// Defaults for build options that only some kernels use.
+#ifndef uidx
+#define uidx 0
+#endif
+
+#ifndef yidx
+#define yidx 0
+#endif
+
+#ifndef PIX_PER_WI_X
+#define PIX_PER_WI_X 1
+#endif
+
+// Two-level paste so that macro arguments (e.g. DATA_TYPE) are expanded before ## is applied.
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+
+// 4- and 3-component vector types of the selected element type.
+#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
+#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
+
+///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
+
+// RGB -> YUV weights laid out as { B->Y, G->Y, R->Y, (B-Y)->U, (R-Y)->V };
+// the integer table holds the same weights in yuv_shift fixed point.
+__constant float c_RGB2YUVCoeffs_f[5]  = { B2YF, G2YF, R2YF, B2UF, R2VF };
+__constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, B2UI, R2VI };
+
+// Converts RGB/BGR pixels to 3-channel YUV.
+//
+// Y is the weighted sum of the colour channels; U and V are the scaled differences
+// (B - Y) and (R - Y) shifted by HALF_MAX_NUM. Float data uses fma, integer data uses
+// yuv_shift fixed-point arithmetic followed by a saturating store.
+// Steps and offsets are in bytes; each work-item handles one column and up to
+// PIX_PER_WI_Y consecutive rows.
+__kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset,
+                      __global uchar* dstptr, int dst_step, int dt_offset,
+                      int rows, int cols)
+{
+    const int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    // columns outside the image have nothing to do
+    if (x >= cols)
+        return;
+
+    int src_pos = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+    int dst_pos = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
+
+    #pragma unroll
+    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+    {
+        // once y has run past the last row the remaining iterations are no-ops
+        if (y >= rows)
+            continue;
+
+        __global const DATA_TYPE* pix_in = (__global const DATA_TYPE*)(srcptr + src_pos);
+        __global DATA_TYPE* pix_out = (__global DATA_TYPE*)(dstptr + dst_pos);
+        DATA_TYPE_3 rgb = vload3(0, pix_in);
+        DATA_TYPE blue = rgb.B_COMP;
+        DATA_TYPE green = rgb.G_COMP;
+        DATA_TYPE red = rgb.R_COMP;
+
+#ifdef DEPTH_5
+        __constant float * weights = c_RGB2YUVCoeffs_f;
+        const DATA_TYPE luma = fma(blue, weights[0], fma(green, weights[1], red * weights[2]));
+        const DATA_TYPE chroma_u = fma(blue - luma, weights[3], HALF_MAX_NUM);
+        const DATA_TYPE chroma_v = fma(red - luma, weights[4], HALF_MAX_NUM);
+#else
+        __constant int * weights = c_RGB2YUVCoeffs_i;
+        // chroma offset expressed in the same fixed-point scale as the products
+        const int delta = HALF_MAX_NUM * (1 << yuv_shift);
+        const int luma = CV_DESCALE(mad24(blue, weights[0], mad24(green, weights[1], mul24(red, weights[2]))), yuv_shift);
+        const int chroma_u = CV_DESCALE(mad24(blue - luma, weights[3], delta), yuv_shift);
+        const int chroma_v = CV_DESCALE(mad24(red - luma, weights[4], delta), yuv_shift);
+#endif
+
+        pix_out[0] = SAT_CAST( luma );
+        pix_out[1] = SAT_CAST( chroma_u );
+        pix_out[2] = SAT_CAST( chroma_v );
+
+        // advance to the same column of the next row
+        ++y;
+        dst_pos += dst_step;
+        src_pos += src_step;
+    }
+}
+
+// YUV -> RGB weights laid out as { U->B, U->G, V->G, V->R };
+// the integer table holds the same weights in yuv_shift fixed point.
+__constant float c_YUV2RGBCoeffs_f[4] = { U2BF, U2GF, V2GF, V2RF };
+__constant int   c_YUV2RGBCoeffs_i[4] = { U2BI, U2GI, V2GI, V2RI };
+
+// Converts YUV pixels to RGB/BGR (dcn output channels).
+//
+// U and V are re-centred by subtracting HALF_MAX_NUM, weighted and added to Y. Float
+// data uses fma, integer data uses yuv_shift fixed-point arithmetic followed by a
+// saturating store. Blue is written to channel bidx, red to channel bidx^2, and when
+// dcn == 4 the fourth channel is set to MAX_NUM.
+// Steps and offsets are in bytes; each work-item handles one column and up to
+// PIX_PER_WI_Y consecutive rows.
+__kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
+                      __global uchar* dstptr, int dst_step, int dt_offset,
+                      int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
+                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
+                // Only Y, U and V are consumed, so load exactly three elements: a 4-wide
+                // load would touch one element beyond the pixel (and beyond the buffer
+                // for the last pixel of a 3-channel image).
+                DATA_TYPE_3 src_pix = vload3(0, src);
+                DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;
+
+#ifdef DEPTH_5
+                __constant float * coeffs = c_YUV2RGBCoeffs_f;
+                float r = fma(V - HALF_MAX_NUM, coeffs[3], Y);
+                float g = fma(V - HALF_MAX_NUM, coeffs[2], fma(U - HALF_MAX_NUM, coeffs[1], Y));
+                float b = fma(U - HALF_MAX_NUM, coeffs[0], Y);
+#else
+                __constant int * coeffs = c_YUV2RGBCoeffs_i;
+                const int r = Y + CV_DESCALE(mul24(V - HALF_MAX_NUM, coeffs[3]), yuv_shift);
+                const int g = Y + CV_DESCALE(mad24(V - HALF_MAX_NUM, coeffs[2], mul24(U - HALF_MAX_NUM, coeffs[1])), yuv_shift);
+                const int b = Y + CV_DESCALE(mul24(U - HALF_MAX_NUM, coeffs[0]), yuv_shift);
+#endif
+
+                dst[bidx] = SAT_CAST( b );
+                dst[1] = SAT_CAST( g );
+                dst[bidx^2] = SAT_CAST( r );
+#if dcn == 4
+                dst[3] = MAX_NUM;
+#endif
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+// Float weights shared by the 8-bit subsampled YUV -> RGB kernels, laid out as
+// { luma scale, U->B, U->G, V->G, V->R }.
+__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f,
+                                            -0.812999725f, 1.5959997177f };
+
+// Converts 8-bit YUV with a full-resolution luma plane followed by an interleaved,
+// 2x2-subsampled chroma plane to RGB/BGR with dcn channels.
+//
+// Luma is read from source rows [0, rows); the chroma pair for output block (x, y) is
+// read from source row rows + y at byte 2 * x, with U at index uidx and V at 1 - uidx.
+// Each work-item converts 2x2 output pixels per iteration (up to PIX_PER_WI_Y blocks
+// vertically) that share one chroma sample. Steps and offsets are in bytes.
+__kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_offset,
+                            __global uchar* dstptr, int dst_step, int dt_offset,
+                            int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    // x and y index 2x2 blocks, hence the halved bounds
+    if (x < cols / 2)
+    {
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows / 2 )
+            {
+                __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
+                __global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
+                __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));
+                __global uchar*       dst2 = dst1 + dst_step;
+
+                // the four luma samples of the block: top-left, top-right, bottom-left, bottom-right
+                float Y1 = ysrc[0];
+                float Y2 = ysrc[1];
+                float Y3 = ysrc[src_step];
+                float Y4 = ysrc[src_step + 1];
+
+                float U  = ((float)usrc[uidx]) - HALF_MAX_NUM;
+                float V  = ((float)usrc[1-uidx]) - HALF_MAX_NUM;
+
+                // chroma contribution per colour channel; the 0.5f rounds the later
+                // float -> uchar conversion to nearest
+                __constant float* coeffs = c_YUV2RGBCoeffs_420;
+                float ruv = fma(coeffs[4], V, 0.5f);
+                float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
+                float buv = fma(coeffs[1], U, 0.5f);
+
+                // luma is offset by 16 (clamped at 0) and scaled before the chroma terms are added
+                Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
+                dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
+                dst1[1]        = convert_uchar_sat(Y1 + guv);
+                dst1[bidx]     = convert_uchar_sat(Y1 + buv);
+#if dcn == 4
+                dst1[3]        = 255;
+#endif
+
+                Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
+                dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
+                dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);
+                dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);
+#if dcn == 4
+                dst1[7]        = 255;
+#endif
+
+                Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
+                dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
+                dst2[1]        = convert_uchar_sat(Y3 + guv);
+                dst2[bidx]     = convert_uchar_sat(Y3 + buv);
+#if dcn == 4
+                dst2[3]        = 255;
+#endif
+
+                Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
+                dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
+                dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);
+                dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);
+#if dcn == 4
+                dst2[7]        = 255;
+#endif
+            }
+            ++y;
+        }
+    }
+}
+
+#if uidx < 2
+
+// Converts 8-bit YUV with a full-resolution luma plane followed by two separate,
+// 2x2-subsampled chroma planes to RGB/BGR with dcn channels.
+//
+// Luma is read from source rows [0, rows); both chroma planes are stored after it.
+// uidx selects which of the two chroma planes is U (the other one is V).
+// Each work-item converts 2x2 output pixels per iteration (up to PIX_PER_WI_Y blocks
+// vertically) that share one chroma sample. Steps and offsets are in bytes.
+__kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
+                                __global uchar* dstptr, int dst_step, int dt_offset,
+                                int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    // x and y index 2x2 blocks, hence the halved bounds
+    if (x < cols / 2)
+    {
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows / 2 )
+            {
+                __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
+                __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
+                __global uchar*       dst2 = dst1 + dst_step;
+
+                // the four luma samples of the block: top-left, top-right, bottom-left, bottom-right
+                float Y1 = ysrc[0];
+                float Y2 = ysrc[1];
+                float Y3 = ysrc[src_step];
+                float Y4 = ysrc[src_step + 1];
+
+#ifdef SRC_CONT
+                // SRC_CONT build: both chroma planes are addressed as flat arrays that start
+                // right after the luma rows; the second plane begins (rows * cols) / 4 bytes
+                // after the first.
+                __global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);
+                int u_ind = mad24(y, cols >> 1, x);
+                float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX_NUM, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX_NUM };
+#else
+                // Stepped source: every source row holds two half-width chroma lines, so chroma
+                // line y sits in row rows + y / 2, in the second half of the row when y is odd.
+                // The second plane starts rows / 4 source rows later; when rows is not a
+                // multiple of 4 it additionally starts half a row in, which vsteps corrects for.
+                int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};
+                __global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);
+                __global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
+                float uv[2] = { ((float)usrc[0]) - HALF_MAX_NUM, ((float)vsrc[0]) - HALF_MAX_NUM };
+#endif
+                float U = uv[uidx];
+                float V = uv[1-uidx];
+
+                // chroma contribution per colour channel; the 0.5f rounds the later
+                // float -> uchar conversion to nearest
+                __constant float* coeffs = c_YUV2RGBCoeffs_420;
+                float ruv = fma(coeffs[4], V, 0.5f);
+                float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
+                float buv = fma(coeffs[1], U, 0.5f);
+
+                // luma is offset by 16 (clamped at 0) and scaled before the chroma terms are added
+                Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
+                dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
+                dst1[1]        = convert_uchar_sat(Y1 + guv);
+                dst1[bidx]     = convert_uchar_sat(Y1 + buv);
+#if dcn == 4
+                dst1[3]        = 255;
+#endif
+
+                Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
+                dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
+                dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);
+                dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);
+#if dcn == 4
+                dst1[7]        = 255;
+#endif
+
+                Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
+                dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
+                dst2[1]        = convert_uchar_sat(Y3 + guv);
+                dst2[bidx]     = convert_uchar_sat(Y3 + buv);
+#if dcn == 4
+                dst2[3]        = 255;
+#endif
+
+                Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
+                dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
+                dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);
+                dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);
+#if dcn == 4
+                dst2[7]        = 255;
+#endif
+            }
+            ++y;
+        }
+    }
+}
+
+#endif
+
+#if uidx < 2
+
+// Float weights for 8-bit RGB -> subsampled YUV, indexed as used by the kernel below:
+// [0..2] weight R, G, B for luma; [3..5] weight R, G, B for the first chroma value;
+// [5..7] weight R, G, B for the second chroma value (entry 5 is shared by both).
+__constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
+                                            0.438999176f, -0.3679990768f, -0.0709991455f };
+
+// Converts 8-bit RGB/BGR (scn channels) to YUV with a full-resolution luma plane
+// followed by two separate, 2x2-subsampled chroma planes.
+//
+// `rows` is the height of the destination; its first rows / 3 * 2 rows receive luma
+// and the rest receive chroma, with two half-width chroma lines per destination row.
+// uidx selects which of the two computed chroma values goes to the first chroma plane.
+// Each work-item handles PIX_PER_WI_X horizontally adjacent 2x2 blocks and up to
+// PIX_PER_WI_Y blocks vertically. Steps and offsets are in bytes.
+__kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
+                                __global uchar* dstptr, int dst_step, int dst_offset,
+                                int rows, int cols)
+{
+    int x = get_global_id(0) * PIX_PER_WI_X;
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    // x and y index 2x2 blocks
+    if (x < cols/2)
+    {
+        int src_index  = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));
+        int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
+        int y_rows = rows / 3 * 2;
+        // offset correction for the second chroma plane when it starts mid-row (see vdst)
+        int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
+        __constant float* coeffs = c_RGB2YUVCoeffs_420;
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows / 3)
+            {
+                __global const uchar* src1 = srcptr + src_index;
+                __global const uchar* src2 = src1 + src_step;
+                __global uchar* ydst1 = dstptr + ydst_index;
+                __global uchar* ydst2 = ydst1 + dst_step;
+
+                // chroma line y is stored in destination row y_rows + y / 2, in the second
+                // half of that row when y is odd; the second plane starts y_rows / 4 rows later
+                __global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);
+                __global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);
+
+#if PIX_PER_WI_X == 2
+                // Four pixels per row are fetched as 32-bit words (3 words when scn is not 4,
+                // 4 words when it is) and unpacked into per-channel floats.
+                // NOTE(review): these word-sized loads/stores presumably rely on the host only
+                // selecting PIX_PER_WI_X == 2 for suitably aligned data - confirm at the call site.
+                int s11 = *((__global const int*) src1);
+                int s12 = *((__global const int*) src1 + 1);
+                int s13 = *((__global const int*) src1 + 2);
+#if scn == 4
+                int s14 = *((__global const int*) src1 + 3);
+#endif
+                int s21 = *((__global const int*) src2);
+                int s22 = *((__global const int*) src2 + 1);
+                int s23 = *((__global const int*) src2 + 2);
+#if scn == 4
+                int s24 = *((__global const int*) src2 + 3);
+#endif
+                float src_pix1[scn * 4], src_pix2[scn * 4];
+
+                *((float4*) src_pix1)     = convert_float4(as_uchar4(s11));
+                *((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
+                *((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
+#if scn == 4
+                *((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
+#endif
+                *((float4*) src_pix2)     = convert_float4(as_uchar4(s21));
+                *((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
+                *((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
+#if scn == 4
+                *((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
+#endif
+                // luma of the four pixels of each row; 16.5f = offset 16 plus 0.5 for rounding
+                uchar4 y1, y2;
+                y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[      2-bidx], fma(coeffs[1], src_pix1[      1], fma(coeffs[2], src_pix1[      bidx], 16.5f))));
+                y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[  scn+2-bidx], fma(coeffs[1], src_pix1[  scn+1], fma(coeffs[2], src_pix1[  scn+bidx], 16.5f))));
+                y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
+                y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
+                y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[      2-bidx], fma(coeffs[1], src_pix2[      1], fma(coeffs[2], src_pix2[      bidx], 16.5f))));
+                y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[  scn+2-bidx], fma(coeffs[1], src_pix2[  scn+1], fma(coeffs[2], src_pix2[  scn+bidx], 16.5f))));
+                y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
+                y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
+
+                // four luma bytes per row are written with a single 32-bit store
+                *((__global int*) ydst1) = as_int(y1);
+                *((__global int*) ydst2) = as_int(y2);
+
+                // chroma is taken from the top-left pixel of each 2x2 block (pixels 0 and 2 of
+                // the top row); 128.5f = offset 128 plus 0.5 for rounding
+                float uv[4] = { fma(coeffs[3], src_pix1[      2-bidx], fma(coeffs[4], src_pix1[      1], fma(coeffs[5], src_pix1[      bidx], 128.5f))),
+                                fma(coeffs[5], src_pix1[      2-bidx], fma(coeffs[6], src_pix1[      1], fma(coeffs[7], src_pix1[      bidx], 128.5f))),
+                                fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
+                                fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
+
+                udst[0] = convert_uchar_sat(uv[uidx]    );
+                vdst[0] = convert_uchar_sat(uv[1 - uidx]);
+                udst[1] = convert_uchar_sat(uv[2 + uidx]);
+                vdst[1] = convert_uchar_sat(uv[3 - uidx]);
+#else
+                // One 2x2 block: pixels are top-left, top-right, bottom-left, bottom-right.
+                // NOTE(review): vload4 always fetches 4 bytes; when scn is 3 the fourth byte lies
+                // outside the pixel (it is never used) - confirm the buffer tolerates this over-read.
+                float4 src_pix1 = convert_float4(vload4(0, src1));
+                float4 src_pix2 = convert_float4(vload4(0, src1+scn));
+                float4 src_pix3 = convert_float4(vload4(0, src2));
+                float4 src_pix4 = convert_float4(vload4(0, src2+scn));
+
+                // 16.5f = offset 16 plus 0.5 for rounding
+                ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
+                ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
+                ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));
+                ydst2[1] = convert_uchar_sat(fma(coeffs[0], src_pix4.R_COMP, fma(coeffs[1], src_pix4.G_COMP, fma(coeffs[2], src_pix4.B_COMP, 16.5f))));
+
+                // chroma is taken from the top-left pixel only; 128.5f = offset 128 plus 0.5 for rounding
+                float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),
+                                fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };
+
+                udst[0] = convert_uchar_sat(uv[uidx]  );
+                vdst[0] = convert_uchar_sat(uv[1-uidx]);
+#endif
+                // advance by one block row, i.e. two image rows
+                ++y;
+                src_index += 2*src_step;
+                ydst_index += 2*dst_step;
+            }
+        }
+    }
+}
+
+#endif
+
+// Converts 8-bit packed YUV, stored as 4-byte groups holding two luma samples plus
+// one U and one V, to RGB/BGR with dcn channels.
+//
+// Within each 4-byte group the luma samples are at bytes yidx and yidx + 2, U is at
+// byte uidx and V at byte (2 + uidx) % 4. Every group yields two horizontally adjacent
+// output pixels that share the chroma pair. Each work-item handles one group per row
+// for up to PIX_PER_WI_Y consecutive rows. Steps and offsets are in bytes.
+__kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,
+                          __global uchar* dstptr, int dst_step, int dst_offset,
+                          int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    // x indexes pixel pairs, hence the halved bound
+    if (x < cols / 2)
+    {
+        __global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);
+        __global uchar*       dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows )
+            {
+                __constant float* coeffs = c_YUV2RGBCoeffs_420;
+
+#ifndef USE_OPTIMIZED_LOAD
+                float U = ((float) src[uidx]) - HALF_MAX_NUM;
+                float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX_NUM;
+                float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
+                float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
+#else
+                // fetch the whole 4-byte group with one load and split it into bytes;
+                // the pointer stays const-qualified because the source is read-only
+                int load_src = *((__global const int*) src);
+                float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
+                float U = vec_src[uidx] - HALF_MAX_NUM;
+                float V = vec_src[(2 + uidx) % 4] - HALF_MAX_NUM;
+                float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];
+                float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];
+#endif
+
+                // chroma contribution per colour channel; the 0.5f rounds the later
+                // float -> uchar conversion to nearest
+                float ruv = fma(coeffs[4], V, 0.5f);
+                float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
+                float buv = fma(coeffs[1], U, 0.5f);
+
+                dst[2 - bidx] = convert_uchar_sat(y00 + ruv);
+                dst[1]        = convert_uchar_sat(y00 + guv);
+                dst[bidx]     = convert_uchar_sat(y00 + buv);
+#if dcn == 4
+                dst[3]        = 255;
+#endif
+
+                dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
+                dst[dcn + 1]        = convert_uchar_sat(y01 + guv);
+                dst[dcn + bidx]     = convert_uchar_sat(y01 + buv);
+#if dcn == 4
+                dst[7]        = 255;
+#endif
+            }
+            // the pointers advance unconditionally; rows past the end are simply not touched
+            ++y;
+            src += src_step;
+            dst += dst_step;
+        }
+    }
+}
+
+///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
+
+// RGB -> YCrCb weights laid out as { R->Y, G->Y, B->Y, (R-Y)->Cr, (B-Y)->Cb };
+// the integer table holds the same weights in yuv_shift fixed point.
+__constant float c_RGB2YCrCbCoeffs_f[5] = {R2YF, G2YF, B2YF, YCRF, YCBF};
+__constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, YCRI, YCBI};
+
+// Converts RGB/BGR pixels to 3-channel YCrCb.
+//
+// Y is the weighted sum of the colour channels; Cr and Cb are the scaled differences
+// (R - Y) and (B - Y) shifted by HALF_MAX_NUM. Float data uses fma, integer data uses
+// yuv_shift fixed-point arithmetic followed by a saturating store.
+// Steps and offsets are in bytes; each work-item handles one column and up to
+// PIX_PER_WI_Y consecutive rows.
+__kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,
+                        __global uchar* dstptr, int dst_step, int dt_offset,
+                        int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
+                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
+                // Only the three colour channels are consumed, so load exactly three
+                // elements: a 4-wide load would touch one element beyond the pixel (and
+                // beyond the buffer for the last pixel of a 3-channel image).
+                DATA_TYPE_3 src_pix = vload3(0, src);
+                DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
+
+#ifdef DEPTH_5
+                __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
+                DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));
+                DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX_NUM);
+                DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX_NUM);
+#else
+                __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
+                // chroma offset expressed in the same fixed-point scale as the products
+                int delta = HALF_MAX_NUM * (1 << yuv_shift);
+                int Y =  CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);
+                int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);
+                int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);
+#endif
+
+                dst[0] = SAT_CAST( Y );
+                dst[1] = SAT_CAST( Cr );
+                dst[2] = SAT_CAST( Cb );
+
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+// YCrCb -> RGB weights laid out as { Cr->R, Cr->G, Cb->G, Cb->B };
+// the integer table holds the same weights in yuv_shift fixed point.
+__constant float c_YCrCb2RGBCoeffs_f[4] = { CR2RF, CR2GF, CB2GF, CB2BF };
+__constant int   c_YCrCb2RGBCoeffs_i[4] = { CR2RI, CR2GI, CB2GI, CB2BI };
+
+// Converts YCrCb pixels to RGB/BGR (dcn output channels).
+//
+// Cr and Cb are re-centred by subtracting HALF_MAX_NUM, weighted and added to Y. Float
+// data uses fma, integer data uses yuv_shift fixed-point arithmetic followed by a
+// saturating store. Blue is written to channel bidx, red to channel bidx^2, and when
+// dcn == 4 the fourth channel is set to MAX_NUM.
+// Steps and offsets are in bytes; each work-item handles one column and up to
+// PIX_PER_WI_Y consecutive rows.
+__kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
+                        __global uchar* dst, int dst_step, int dst_offset,
+                        int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index);
+                __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index);
+
+                // Only Y, Cr and Cb are consumed, so load exactly three elements: a 4-wide
+                // load would touch one element beyond the pixel (and beyond the buffer
+                // for the last pixel of a 3-channel image).
+                DATA_TYPE_3 src_pix = vload3(0, srcptr);
+                DATA_TYPE yp = src_pix.x, cr = src_pix.y, cb = src_pix.z;
+
+#ifdef DEPTH_5
+                __constant float * coeff = c_YCrCb2RGBCoeffs_f;
+                float r = fma(coeff[0], cr - HALF_MAX_NUM, yp);
+                float g = fma(coeff[1], cr - HALF_MAX_NUM, fma(coeff[2], cb - HALF_MAX_NUM, yp));
+                float b = fma(coeff[3], cb - HALF_MAX_NUM, yp);
+#else
+                __constant int * coeff = c_YCrCb2RGBCoeffs_i;
+                int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX_NUM), yuv_shift);
+                int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX_NUM, coeff[2] * (cb - HALF_MAX_NUM)), yuv_shift);
+                int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX_NUM), yuv_shift);
+#endif
+
+                dstptr[(bidx^2)] = SAT_CAST(r);
+                dstptr[1] = SAT_CAST(g);
+                dstptr[bidx] = SAT_CAST(b);
+#if dcn == 4
+                dstptr[3] = MAX_NUM;
+#endif
+
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl