fixed wrong equivalence in YUV conversion (#7481)

* fixed wrong equivalence in YUV conversion
* fixed channel order from YVU to YUV
2025-01-18 22:44:02 +08:00 · 2016-11-23 18:39:18 +04:00 · 2016-11-23 18:39:18 +04:00 · 7db43f9fff
commit 7db43f9fff
parent b8c875d4a1
2 changed files with 272 additions and 153 deletions
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@ -105,6 +105,39 @@

 namespace cv
 {
+//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601; *F are float weights, *I are the same weights in fixed point (scaled by 16384 == 1 << yuv_shift)
+const float B2YF = 0.114f; // weight of B in luma Y
+const float G2YF = 0.587f; // weight of G in luma Y
+const float R2YF = 0.299f; // weight of R in luma Y
+//to YCrCb: Cb = (B - Y)*YCBF + delta, Cr = (R - Y)*YCRF + delta
+const float YCBF = 0.564f; // == 1/2/(1-B2YF)
+const float YCRF = 0.713f; // == 1/2/(1-R2YF)
+const int YCBI = 9241;  // == YCBF*16384
+const int YCRI = 11682; // == YCRF*16384
+//to YUV: U = (B - Y)*B2UF + delta, V = (R - Y)*R2VF + delta
+const float B2UF = 0.492f;
+const float R2VF = 0.877f;
+const int B2UI = 8061;  // == B2UF*16384
+const int R2VI = 14369; // == R2VF*16384
+//from YUV: B = Y + U'*U2BF, G = Y + U'*U2GF + V'*V2GF, R = Y + V'*V2RF (U', V' have delta subtracted)
+const float U2BF = 2.032f;
+const float U2GF = -0.395f;
+const float V2GF = -0.581f;
+const float V2RF = 1.140f;
+const int U2BI = 33292; // == U2BF*16384
+const int U2GI = -6472; // == U2GF*16384
+const int V2GI = -9519; // == V2GF*16384
+const int V2RI = 18678; // == V2RF*16384
+//from YCrCb: B = Y + Cb'*CB2BF, G = Y + Cb'*CB2GF + Cr'*CR2GF, R = Y + Cr'*CR2RF (Cb', Cr' have delta subtracted)
+const float CB2BF = 1.773f;
+const float CB2GF = -0.344f;
+const float CR2GF = -0.714f;
+const float CR2RF = 1.403f;
+const int CB2BI = 29049;  // == CB2BF*16384
+const int CB2GI = -5636;  // == CB2GF*16384
+const int CR2GI = -11698; // == CR2GF*16384
+const int CR2RI = 22987;  // == CR2RF*16384
+

 // computes cubic spline coefficients for a function: (xi=i, yi=f[i]), i=0..n
 template<typename _Tp> static void splineBuild(const _Tp* f, int n, _Tp* tab)
@ -499,9 +532,9 @@ struct IPPColor2GrayFunctor
    IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
        ippiColorToGray(_func)
    {
-        coeffs[0] = 0.114f;
-        coeffs[1] = 0.587f;
-        coeffs[2] = 0.299f;
+        coeffs[0] = B2YF;
+        coeffs[1] = G2YF;
+        coeffs[2] = R2YF;
    }
    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    {
@ -1143,9 +1176,9 @@ enum
 {
    yuv_shift = 14,
    xyz_shift = 12,
-    R2Y = 4899,
-    G2Y = 9617,
-    B2Y = 1868,
+    R2Y = 4899, // == R2YF*16384
+    G2Y = 9617, // == G2YF*16384
+    B2Y = 1868, // == B2YF*16384
    BLOCK_SIZE = 256
 };

@ -1313,7 +1346,7 @@ template<typename _Tp> struct RGB2Gray

    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
-        static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
+        static const float coeffs0[] = { R2YF, G2YF, B2YF };
        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);
@ -1460,7 +1493,7 @@ struct RGB2Gray<float>

    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
-        static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
+        static const float coeffs0[] = { R2YF, G2YF, B2YF };
        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);
@ -1650,7 +1683,7 @@ struct RGB2Gray<float>

    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
-        static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
+        static const float coeffs0[] = { R2YF, G2YF, B2YF };
        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);
@ -1772,16 +1805,18 @@ template<typename _Tp> struct RGB2YCrCb_f
 {
    typedef _Tp channel_type;

-    RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : srccn(_srccn), blueIdx(_blueIdx)
+    RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF };
+        static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
        if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
    }

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        int scn = srccn, bidx = blueIdx;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const _Tp delta = ColorChannel<_Tp>::half();
        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
        n *= 3;
@ -1790,10 +1825,11 @@ template<typename _Tp> struct RGB2YCrCb_f
            _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
            _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta);
            _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta);
-            dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
+            dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb;
        }
    }
    int srccn, blueIdx;
+    bool isCrCb;
    float coeffs[5];
 };

@ -1804,11 +1840,12 @@ struct RGB2YCrCb_f<float>
 {
    typedef float channel_type;

-    RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
-        srccn(_srccn), blueIdx(_blueIdx)
+    RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) :
+        srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF };
+        static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
        if(blueIdx==0)
            std::swap(coeffs[0], coeffs[2]);

@ -1823,6 +1860,7 @@ struct RGB2YCrCb_f<float>
    void operator()(const float * src, float * dst, int n) const
    {
        int scn = srccn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const float delta = ColorChannel<float>::half();
        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
        n *= 3;
@ -1832,8 +1870,8 @@ struct RGB2YCrCb_f<float>
            {
                float32x4x3_t v_src = vld3q_f32(src), v_dst;
                v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
-                v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
-                v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
+                v_dst.val[1+yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
+                v_dst.val[2-yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);

                vst3q_f32(dst + i, v_dst);
            }
@ -1843,8 +1881,8 @@ struct RGB2YCrCb_f<float>
                float32x4x4_t v_src = vld4q_f32(src);
                float32x4x3_t v_dst;
                v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
-                v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
-                v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
+                v_dst.val[1+yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
+                v_dst.val[2-yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);

                vst3q_f32(dst + i, v_dst);
            }
@ -1854,10 +1892,11 @@ struct RGB2YCrCb_f<float>
            float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
            float Cr = (src[bidx^2] - Y)*C3 + delta;
            float Cb = (src[bidx] - Y)*C4 + delta;
-            dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
+            dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb;
        }
    }
    int srccn, blueIdx;
+    bool isCrCb;
    float coeffs[5];
    float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
 };
@ -1869,11 +1908,12 @@ struct RGB2YCrCb_f<float>
 {
    typedef float channel_type;

-    RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
-        srccn(_srccn), blueIdx(_blueIdx)
+    RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) :
+        srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF };
+        static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
        if (blueIdx==0)
            std::swap(coeffs[0], coeffs[2]);

@ -1901,6 +1941,7 @@ struct RGB2YCrCb_f<float>
    void operator()(const float * src, float * dst, int n) const
    {
        int scn = srccn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const float delta = ColorChannel<float>::half();
        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
        n *= 3;
@ -1934,14 +1975,19 @@ struct RGB2YCrCb_f<float>
                process(v_r1, v_g1, v_b1,
                        v_y1, v_cr1, v_cb1);

-                _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
+                if(isCrCb)
+                    _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
+                else //YUV
+                {
+                    _mm_interleave_ps(v_y0, v_y1, v_cb0, v_cb1, v_cr0, v_cr1);
+                }

                _mm_storeu_ps(dst + i, v_y0);
                _mm_storeu_ps(dst + i + 4, v_y1);
-                _mm_storeu_ps(dst + i + 8, v_cr0);
-                _mm_storeu_ps(dst + i + 12, v_cr1);
-                _mm_storeu_ps(dst + i + 16, v_cb0);
-                _mm_storeu_ps(dst + i + 20, v_cb1);
+                _mm_storeu_ps(dst + i + 8  + yuvOrder*8, v_cr0);
+                _mm_storeu_ps(dst + i + 12 + yuvOrder*8, v_cr1);
+                _mm_storeu_ps(dst + i + 16 - yuvOrder*8, v_cb0);
+                _mm_storeu_ps(dst + i + 20 - yuvOrder*8, v_cb1);
            }
        }

@ -1950,10 +1996,11 @@ struct RGB2YCrCb_f<float>
            float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
            float Cr = (src[bidx^2] - Y)*C3 + delta;
            float Cb = (src[bidx] - Y)*C4 + delta;
-            dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
+            dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb;
        }
    }
    int srccn, blueIdx;
+    bool isCrCb;
    float coeffs[5];
    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
    bool haveSIMD;
@ -1965,16 +2012,18 @@ template<typename _Tp> struct RGB2YCrCb_i
 {
    typedef _Tp channel_type;

-    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
-        : srccn(_srccn), blueIdx(_blueIdx)
+    RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
+        : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
+        static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
        if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
    }
    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        int scn = srccn, bidx = blueIdx;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
        int delta = ColorChannel<_Tp>::half()*(1 << yuv_shift);
        n *= 3;
@ -1984,11 +2033,12 @@ template<typename _Tp> struct RGB2YCrCb_i
            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
            dst[i] = saturate_cast<_Tp>(Y);
-            dst[i+1] = saturate_cast<_Tp>(Cr);
-            dst[i+2] = saturate_cast<_Tp>(Cb);
+            dst[i+1+yuvOrder] = saturate_cast<_Tp>(Cr);
+            dst[i+2-yuvOrder] = saturate_cast<_Tp>(Cb);
        }
    }
    int srccn, blueIdx;
+    bool isCrCb;
    int coeffs[5];
 };

@ -1999,11 +2049,12 @@ struct RGB2YCrCb_i<uchar>
 {
    typedef uchar channel_type;

-    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
-        : srccn(_srccn), blueIdx(_blueIdx)
+    RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
+        : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
+        static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
        if (blueIdx==0)
            std::swap(coeffs[0], coeffs[2]);

@ -2019,6 +2070,7 @@ struct RGB2YCrCb_i<uchar>
    void operator()(const uchar * src, uchar * dst, int n) const
    {
        int scn = srccn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
        int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
        n *= 3;
@ -2067,8 +2119,8 @@ struct RGB2YCrCb_i<uchar>
            v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);

            v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
-            v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
-            v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
+            v_dst.val[1+yuvOrder] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
+            v_dst.val[2-yuvOrder] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));

            vst3_u8(dst + i, v_dst);
        }
@ -2079,11 +2131,12 @@ struct RGB2YCrCb_i<uchar>
            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
            dst[i] = saturate_cast<uchar>(Y);
-            dst[i+1] = saturate_cast<uchar>(Cr);
-            dst[i+2] = saturate_cast<uchar>(Cb);
+            dst[i+1+yuvOrder] = saturate_cast<uchar>(Cr);
+            dst[i+2-yuvOrder] = saturate_cast<uchar>(Cb);
        }
    }
    int srccn, blueIdx, coeffs[5];
+    bool isCrCb;
    int16x4_t v_c0, v_c1, v_c2;
    int32x4_t v_c3, v_c4, v_delta, v_delta2;
 };
@ -2093,11 +2146,12 @@ struct RGB2YCrCb_i<ushort>
 {
    typedef ushort channel_type;

-    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
-        : srccn(_srccn), blueIdx(_blueIdx)
+    RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
+        : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
+        static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
        if (blueIdx==0)
            std::swap(coeffs[0], coeffs[2]);

@ -2113,6 +2167,7 @@ struct RGB2YCrCb_i<ushort>
    void operator()(const ushort * src, ushort * dst, int n) const
    {
        int scn = srccn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
        int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
        n *= 3;
@ -2155,8 +2210,8 @@ struct RGB2YCrCb_i<ushort>
            v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);

            v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
-            v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
-            v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
+            v_dst.val[1+yuvOrder] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
+            v_dst.val[2-yuvOrder] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));

            vst3q_u16(dst + i, v_dst);
        }
@ -2189,8 +2244,8 @@ struct RGB2YCrCb_i<ushort>
            v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);

            v_dst.val[0] = vqmovun_s32(v_Y);
-            v_dst.val[1] = vqmovun_s32(v_Cr);
-            v_dst.val[2] = vqmovun_s32(v_Cb);
+            v_dst.val[1+yuvOrder] = vqmovun_s32(v_Cr);
+            v_dst.val[2-yuvOrder] = vqmovun_s32(v_Cb);

            vst3_u16(dst + i, v_dst);
        }
@ -2201,11 +2256,12 @@ struct RGB2YCrCb_i<ushort>
            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
            dst[i] = saturate_cast<ushort>(Y);
-            dst[i+1] = saturate_cast<ushort>(Cr);
-            dst[i+2] = saturate_cast<ushort>(Cb);
+            dst[i+1+yuvOrder] = saturate_cast<ushort>(Cr);
+            dst[i+2-yuvOrder] = saturate_cast<ushort>(Cb);
        }
    }
    int srccn, blueIdx, coeffs[5];
+    bool isCrCb;
    int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
 };

@ -2216,11 +2272,12 @@ struct RGB2YCrCb_i<uchar>
 {
    typedef uchar channel_type;

-    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
-        : srccn(_srccn), blueIdx(_blueIdx)
+    RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
+        : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
+        static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
        if (blueIdx==0)
            std::swap(coeffs[0], coeffs[2]);

@ -2229,7 +2286,10 @@ struct RGB2YCrCb_i<uchar>
        v_delta_32 = _mm_set1_epi32(delta);
        short delta2 = 1 + ColorChannel<uchar>::half() * 2;
        v_coeff = _mm_set_epi16(delta2, (short)coeffs[4], delta2, (short)coeffs[3], delta2, (short)coeffs[4], delta2, (short)coeffs[3]);
-        v_shuffle2 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xf, 0xe, 0xc, 0xb, 0xa, 0x8, 0x7, 0x6, 0x4, 0x3, 0x2, 0x0);
+        if(isCrCb)
+            v_shuffle2 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xf, 0xe, 0xc, 0xb, 0xa, 0x8, 0x7, 0x6, 0x4, 0x3, 0x2, 0x0);
+        else //if YUV
+            v_shuffle2 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xe, 0xf, 0xc, 0xa, 0xb, 0x8, 0x6, 0x7, 0x4, 0x2, 0x3, 0x0);
        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
    }

@ -2283,6 +2343,7 @@ struct RGB2YCrCb_i<uchar>
    void operator()(const uchar * src, uchar * dst, int n) const
    {
        int scn = srccn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
        int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
        n *= 3;
@ -2359,8 +2420,8 @@ struct RGB2YCrCb_i<uchar>
            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
            dst[i] = saturate_cast<uchar>(Y);
-            dst[i+1] = saturate_cast<uchar>(Cr);
-            dst[i+2] = saturate_cast<uchar>(Cb);
+            dst[i+1+yuvOrder] = saturate_cast<uchar>(Cr);
+            dst[i+2-yuvOrder] = saturate_cast<uchar>(Cb);
        }
    }

@ -2368,6 +2429,7 @@ struct RGB2YCrCb_i<uchar>
    __m128i v_coeff;
    __m128i v_shuffle2;
    int srccn, blueIdx, coeffs[5];
+    bool isCrCb;
    bool haveSIMD;
 };

@ -2376,11 +2438,12 @@ struct RGB2YCrCb_i<ushort>
 {
    typedef ushort channel_type;

-    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
-        : srccn(_srccn), blueIdx(_blueIdx)
+    RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
+        : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
+        static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
        if (blueIdx==0)
            std::swap(coeffs[0], coeffs[2]);

@ -2437,6 +2500,7 @@ struct RGB2YCrCb_i<ushort>
    void operator()(const ushort * src, ushort * dst, int n) const
    {
        int scn = srccn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
        int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
        n *= 3;
@ -2471,14 +2535,17 @@ struct RGB2YCrCb_i<ushort>
                process(v_r1, v_g1, v_b1,
                        v_y1, v_cr1, v_cb1);

-                _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
+                if(isCrCb)
+                    _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
+                else //YUV
+                    _mm_interleave_epi16(v_y0, v_y1, v_cb0, v_cb1, v_cr0, v_cr1);

                _mm_storeu_si128((__m128i *)(dst + i), v_y0);
                _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1);
-                _mm_storeu_si128((__m128i *)(dst + i + 16), v_cr0);
-                _mm_storeu_si128((__m128i *)(dst + i + 24), v_cr1);
-                _mm_storeu_si128((__m128i *)(dst + i + 32), v_cb0);
-                _mm_storeu_si128((__m128i *)(dst + i + 40), v_cb1);
+                _mm_storeu_si128((__m128i *)(dst + i + 16 + yuvOrder*16), v_cr0);
+                _mm_storeu_si128((__m128i *)(dst + i + 24 + yuvOrder*16), v_cr1);
+                _mm_storeu_si128((__m128i *)(dst + i + 32 - yuvOrder*16), v_cb0);
+                _mm_storeu_si128((__m128i *)(dst + i + 40 - yuvOrder*16), v_cb1);
            }
        }

@ -2488,12 +2555,13 @@ struct RGB2YCrCb_i<ushort>
            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
            dst[i] = saturate_cast<ushort>(Y);
-            dst[i+1] = saturate_cast<ushort>(Cr);
-            dst[i+2] = saturate_cast<ushort>(Cb);
+            dst[i+1+yuvOrder] = saturate_cast<ushort>(Cr);
+            dst[i+2-yuvOrder] = saturate_cast<ushort>(Cb);
        }
    }

    int srccn, blueIdx, coeffs[5];
+    bool isCrCb;
    __m128i v_c0, v_c1, v_c2;
    __m128i v_c3, v_c4, v_delta, v_delta2;
    __m128i v_zero;
@ -2506,23 +2574,25 @@ template<typename _Tp> struct YCrCb2RGB_f
 {
    typedef _Tp channel_type;

-    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
-        : dstcn(_dstcn), blueIdx(_blueIdx)
+    YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb)
+        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+        static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF};
+        static const float coeffs_yuv[] = { V2RF,  V2GF,  U2GF,  U2BF};
+        memcpy(coeffs, isCrCb ? coeffs_cbr : coeffs_yuv, 4*sizeof(coeffs[0]));
    }
    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        n *= 3;
        for(int i = 0; i < n; i += 3, dst += dcn)
        {
            _Tp Y = src[i];
-            _Tp Cr = src[i+1];
-            _Tp Cb = src[i+2];
+            _Tp Cr = src[i+1+yuvOrder];
+            _Tp Cb = src[i+2-yuvOrder];

            _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
            _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
@ -2534,6 +2604,7 @@ template<typename _Tp> struct YCrCb2RGB_f
        }
    }
    int dstcn, blueIdx;
+    bool isCrCb;
    float coeffs[4];
 };

@ -2544,11 +2615,12 @@ struct YCrCb2RGB_f<float>
 {
    typedef float channel_type;

-    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
-        : dstcn(_dstcn), blueIdx(_blueIdx)
+    YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb)
+        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+        static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF};
+        static const float coeffs_yuv[] = { V2RF,  V2GF,  U2GF,  U2BF};
+        memcpy(coeffs, isCrCb ? coeffs_cbr : coeffs_yuv, 4*sizeof(coeffs[0]));

        v_c0 = vdupq_n_f32(coeffs[0]);
        v_c1 = vdupq_n_f32(coeffs[1]);
@ -2561,6 +2633,7 @@ struct YCrCb2RGB_f<float>
    void operator()(const float* src, float* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        n *= 3;
@ -2569,7 +2642,7 @@ struct YCrCb2RGB_f<float>
            for ( ; i <= n - 12; i += 12, dst += 12)
            {
                float32x4x3_t v_src = vld3q_f32(src + i), v_dst;
-                float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
+                float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1+yuvOrder], v_Cb = v_src.val[2-yuvOrder];

                v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
                v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
@ -2582,7 +2655,7 @@ struct YCrCb2RGB_f<float>
            {
                float32x4x3_t v_src = vld3q_f32(src + i);
                float32x4x4_t v_dst;
-                float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
+                float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1+yuvOrder], v_Cb = v_src.val[2-yuvOrder];

                v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
                v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
@ -2594,7 +2667,7 @@ struct YCrCb2RGB_f<float>

        for ( ; i < n; i += 3, dst += dcn)
        {
-            float Y = src[i], Cr = src[i+1], Cb = src[i+2];
+            float Y = src[i], Cr = src[i+1+yuvOrder], Cb = src[i+2-yuvOrder];

            float b = Y + (Cb - delta)*C3;
            float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
@ -2606,6 +2679,7 @@ struct YCrCb2RGB_f<float>
        }
    }
    int dstcn, blueIdx;
+    bool isCrCb;
    float coeffs[4];
    float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
 };
@ -2617,11 +2691,12 @@ struct YCrCb2RGB_f<float>
 {
    typedef float channel_type;

-    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
-        : dstcn(_dstcn), blueIdx(_blueIdx)
+    YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb)
+        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+        static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF};
+        static const float coeffs_yuv[] = { V2RF,  V2GF,  U2GF,  U2BF};
+        memcpy(coeffs, isCrCb ? coeffs_cbr : coeffs_yuv, 4*sizeof(coeffs[0]));

        v_c0 = _mm_set1_ps(coeffs[0]);
        v_c1 = _mm_set1_ps(coeffs[1]);
@ -2639,6 +2714,9 @@ struct YCrCb2RGB_f<float>
        v_cb = _mm_sub_ps(v_cb, v_delta);
        v_cr = _mm_sub_ps(v_cr, v_delta);

+        if (!isCrCb)
+            std::swap(v_cb, v_cr);
+
        v_b = _mm_mul_ps(v_cb, v_c3);
        v_g = _mm_add_ps(_mm_mul_ps(v_cb, v_c2), _mm_mul_ps(v_cr, v_c1));
        v_r = _mm_mul_ps(v_cr, v_c0);
@ -2654,6 +2732,7 @@ struct YCrCb2RGB_f<float>
    void operator()(const float* src, float* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        n *= 3;
@ -2704,7 +2783,7 @@ struct YCrCb2RGB_f<float>

        for ( ; i < n; i += 3, dst += dcn)
        {
-            float Y = src[i], Cr = src[i+1], Cb = src[i+2];
+            float Y = src[i], Cr = src[i+1+yuvOrder], Cb = src[i+2-yuvOrder];

            float b = Y + (Cb - delta)*C3;
            float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
@ -2716,6 +2795,7 @@ struct YCrCb2RGB_f<float>
        }
    }
    int dstcn, blueIdx;
+    bool isCrCb;
    float coeffs[4];

    __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
@ -2728,24 +2808,26 @@ template<typename _Tp> struct YCrCb2RGB_i
 {
    typedef _Tp channel_type;

-    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
-        : dstcn(_dstcn), blueIdx(_blueIdx)
+    YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb)
+        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {22987, -11698, -5636, 29049};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI};
+        static const int coeffs_yuv[] = {  V2RI,  V2GI,  U2GI, U2BI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 4*sizeof(coeffs[0]));
    }

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        n *= 3;
        for(int i = 0; i < n; i += 3, dst += dcn)
        {
            _Tp Y = src[i];
-            _Tp Cr = src[i+1];
-            _Tp Cb = src[i+2];
+            _Tp Cr = src[i+1+yuvOrder];
+            _Tp Cb = src[i+2-yuvOrder];

            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
@ -2759,6 +2841,7 @@ template<typename _Tp> struct YCrCb2RGB_i
        }
    }
    int dstcn, blueIdx;
+    bool isCrCb;
    int coeffs[4];
 };

@ -2769,11 +2852,12 @@ struct YCrCb2RGB_i<uchar>
 {
    typedef uchar channel_type;

-    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
-        : dstcn(_dstcn), blueIdx(_blueIdx)
+    YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb)
+        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {22987, -11698, -5636, 29049};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI};
+        static const int coeffs_yuv[] = {  V2RI,  V2GI,  U2GI, U2BI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 4*sizeof(coeffs[0]));

        v_c0 = vdupq_n_s32(coeffs[0]);
        v_c1 = vdupq_n_s32(coeffs[1]);
@ -2787,6 +2871,7 @@ struct YCrCb2RGB_i<uchar>
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        n *= 3;
@ -2800,8 +2885,8 @@ struct YCrCb2RGB_i<uchar>
            v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));

            int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
-                      v_Cr = vget_low_s16(v_src16.val[1]),
-                      v_Cb = vget_low_s16(v_src16.val[2]);
+                      v_Cr = vget_low_s16(v_src16.val[1+yuvOrder]),
+                      v_Cb = vget_low_s16(v_src16.val[2-yuvOrder]);

            int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
            v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
@ -2811,8 +2896,8 @@ struct YCrCb2RGB_i<uchar>
            v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);

            v_Y = vget_high_s16(v_src16.val[0]);
-            v_Cr = vget_high_s16(v_src16.val[1]);
-            v_Cb = vget_high_s16(v_src16.val[2]);
+            v_Cr = vget_high_s16(v_src16.val[1+yuvOrder]);
+            v_Cb = vget_high_s16(v_src16.val[2-yuvOrder]);

            int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
            v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
@ -2847,8 +2932,8 @@ struct YCrCb2RGB_i<uchar>
        for ( ; i < n; i += 3, dst += dcn)
        {
            uchar Y = src[i];
-            uchar Cr = src[i+1];
-            uchar Cb = src[i+2];
+            uchar Cr = src[i+1+yuvOrder];
+            uchar Cb = src[i+2-yuvOrder];

            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
@ -2862,6 +2947,7 @@ struct YCrCb2RGB_i<uchar>
        }
    }
    int dstcn, blueIdx;
+    bool isCrCb;
    int coeffs[4];

    int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2;
@ -2874,11 +2960,12 @@ struct YCrCb2RGB_i<ushort>
 {
    typedef ushort channel_type;

-    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
-        : dstcn(_dstcn), blueIdx(_blueIdx)
+    YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb)
+        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {22987, -11698, -5636, 29049};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI};
+        static const int coeffs_yuv[] = {  V2RI,  V2GI,  U2GI, U2BI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 4*sizeof(coeffs[0]));

        v_c0 = vdupq_n_s32(coeffs[0]);
        v_c1 = vdupq_n_s32(coeffs[1]);
@ -2893,6 +2980,7 @@ struct YCrCb2RGB_i<ushort>
    void operator()(const ushort* src, ushort* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        n *= 3;
@ -2902,8 +2990,8 @@ struct YCrCb2RGB_i<ushort>
            uint16x8x3_t v_src = vld3q_u16(src + i);

            int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
-                      v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
-                      v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
+                      v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1+yuvOrder]))),
+                      v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2-yuvOrder])));

            int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
            v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
@ -2913,8 +3001,8 @@ struct YCrCb2RGB_i<ushort>
            v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);

            v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))),
-            v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))),
-            v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
+            v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1+yuvOrder]))),
+            v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2-yuvOrder])));

            int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
            v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
@ -2951,8 +3039,8 @@ struct YCrCb2RGB_i<ushort>
            uint16x4x3_t v_src = vld3_u16(src + i);

            int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
-                      v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
-                      v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+                      v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1+yuvOrder])),
+                      v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2-yuvOrder]));

            int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
            v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y);
@ -2987,8 +3075,8 @@ struct YCrCb2RGB_i<ushort>
        for ( ; i < n; i += 3, dst += dcn)
        {
            ushort Y = src[i];
-            ushort Cr = src[i+1];
-            ushort Cb = src[i+2];
+            ushort Cr = src[i+1+yuvOrder];
+            ushort Cb = src[i+2-yuvOrder];

            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
@ -3002,6 +3090,7 @@ struct YCrCb2RGB_i<ushort>
        }
    }
    int dstcn, blueIdx;
+    bool isCrCb;
    int coeffs[4];

    int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta;
@ -3016,11 +3105,12 @@ struct YCrCb2RGB_i<uchar>
 {
    typedef uchar channel_type;

-    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
-        : dstcn(_dstcn), blueIdx(_blueIdx)
+    YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb)
+        : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb)
    {
-        static const int coeffs0[] = {22987, -11698, -5636, 29049};
-        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+        static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI};
+        static const int coeffs_yuv[] = {  V2RI,  V2GI,  U2GI, U2BI };
+        memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 4*sizeof(coeffs[0]));

        v_c0 = _mm_set1_epi16((short)coeffs[0]);
        v_c1 = _mm_set1_epi16((short)coeffs[1]);
@ -3033,7 +3123,9 @@ struct YCrCb2RGB_i<uchar>
        uchar alpha = ColorChannel<uchar>::max();
        v_alpha = _mm_set1_epi8(*(char *)&alpha);

-        useSSE = coeffs[0] <= std::numeric_limits<short>::max();
+        // With the YUV coefficients, U2BI (33292) exceeds std::numeric_limits<short>::max(),
+        // so the coefficients do not all fit in 16 bits; the SSE path is used for YCrCb only.
+        useSSE = isCrCb;
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
    }

@ -3152,6 +3244,7 @@ struct YCrCb2RGB_i<uchar>
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
+        int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        n *= 3;
@ -3316,8 +3409,8 @@ struct YCrCb2RGB_i<uchar>
        for ( ; i < n; i += 3, dst += dcn)
        {
            uchar Y = src[i];
-            uchar Cr = src[i+1];
-            uchar Cb = src[i+2];
+            uchar Cr = src[i+1+yuvOrder];
+            uchar Cb = src[i+2-yuvOrder];

            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
@ -3332,6 +3425,7 @@ struct YCrCb2RGB_i<uchar>
    }
    int dstcn, blueIdx;
    int coeffs[4];
+    bool isCrCb;
    bool useSSE, haveSIMD;

    __m128i v_c0, v_c1, v_c2, v_c3, v_delta2;
@ -7992,7 +8086,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
    case COLOR_RGB2YUV:
    {
        CV_Assert(scn == 3 || scn == 4);
-        bidx = code == COLOR_RGB2YUV ? 0 : 2;
+        bidx = code == COLOR_RGB2YUV ? 2 : 0;
        dcn = 3;
        k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
                 opts + format("-D dcn=3 -D bidx=%d", bidx));
@ -8001,9 +8095,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
    case COLOR_YUV2BGR:
    case COLOR_YUV2RGB:
    {
-        if(dcn < 0) dcn = 3;
+        if(dcn <= 0) dcn = 3;
        CV_Assert(dcn == 3 || dcn == 4);
-        bidx = code == COLOR_YUV2RGB ? 0 : 2;
+        bidx = code == COLOR_YUV2RGB ? 2 : 0;
        k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
        break;
@ -8858,17 +8952,13 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
    }
 #endif

-    static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
-    static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
-    const float* coeffs_f = isCbCr ? 0 : yuv_f;
-    const int* coeffs_i = isCbCr ? 0 : yuv_i;
    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i<uchar>(scn, blueIdx, coeffs_i));
+        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i<uchar>(scn, blueIdx, isCbCr));
    else if( depth == CV_16U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i<ushort>(scn, blueIdx, coeffs_i));
+        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i<ushort>(scn, blueIdx, isCbCr));
    else
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_f<float>(scn, blueIdx, coeffs_f));
+        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_f<float>(scn, blueIdx, isCbCr));
 }

 void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
@ -8914,17 +9004,13 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
    }
 #endif

-    static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
-    static const int yuv_i[] = { 33292, -6472, -9519, 18678 };
-    const float* coeffs_f = isCbCr ? 0 : yuv_f;
-    const int* coeffs_i = isCbCr ? 0 : yuv_i;
    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i<uchar>(dcn, blueIdx, coeffs_i));
+        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i<uchar>(dcn, blueIdx, isCbCr));
    else if( depth == CV_16U )
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i<ushort>(dcn, blueIdx, coeffs_i));
+        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i<ushort>(dcn, blueIdx, isCbCr));
    else
-        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_f<float>(dcn, blueIdx, coeffs_f));
+        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_f<float>(dcn, blueIdx, isCbCr));
 }

 void cvtBGRtoXYZ(const uchar * src_data, size_t src_step,
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
@ -84,6 +84,39 @@ enum
    BLOCK_SIZE = 256
 };

+//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
+#define B2YF 0.114f
+#define G2YF 0.587f
+#define R2YF 0.299f
+//to YCrCb
+#define YCBF 0.564f
+#define YCRF 0.713f
+#define YCBI 9241
+#define YCRI 11682
+//to YUV
+#define B2UF 0.492f
+#define R2VF 0.877f
+#define B2UI 8061
+#define R2VI 14369
+//from YUV
+#define U2BF 2.032f
+#define U2GF -0.395f
+#define V2GF -0.581f
+#define V2RF 1.140f
+#define U2BI 33292
+#define U2GI -6472
+#define V2GI -9519
+#define V2RI 18678
+//from YCrCb
+#define CR2RF 1.403f
+#define CB2GF -0.344f
+#define CR2GF -0.714f
+#define CB2BF 1.773f
+#define CR2RI 22987
+#define CB2GI -5636
+#define CR2GI -11698
+#define CB2BI 29049
+
 #define scnbytes ((int)sizeof(DATA_TYPE)*scn)
 #define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)

@ -151,7 +184,7 @@ __kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offs
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE_4 src_pix = vload4(0, src);
 #ifdef DEPTH_5
-                dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f));
+                dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
 #else
                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
 #endif
@ -201,8 +234,8 @@ __kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offs

 ///////////////////////////////////// RGB <-> YUV //////////////////////////////////////

-__constant float c_RGB2YUVCoeffs_f[5]  = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
-__constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, 8061, 14369 };
+__constant float c_RGB2YUVCoeffs_f[5]  = { B2YF, G2YF, R2YF, B2UF, R2VF };
+__constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, B2UI, R2VI };

 __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dt_offset,
@ -251,8 +284,8 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset
    }
 }

-__constant float c_YUV2RGBCoeffs_f[4] = { 2.032f, -0.395f, -0.581f, 1.140f };
-__constant int   c_YUV2RGBCoeffs_i[4] = { 33292, -6472, -9519, 18678 };
+__constant float c_YUV2RGBCoeffs_f[4] = { U2BF, U2GF, V2GF, V2RF };
+__constant int   c_YUV2RGBCoeffs_i[4] = { U2BI, U2GI, V2GI, V2RI };

 __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dt_offset,
@ -624,8 +657,8 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of

 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////

-__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
-__constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
+__constant float c_RGB2YCrCbCoeffs_f[5] = {R2YF, G2YF, B2YF, YCRF, YCBF};
+__constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, YCRI, YCBI};

 __kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,
                        __global uchar* dstptr, int dst_step, int dt_offset,
@ -674,8 +707,8 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offs
    }
 }

-__constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };
-__constant int   c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };
+__constant float c_YCrCb2RGBCoeffs_f[4] = { CR2RF, CR2GF, CB2GF, CB2BF };
+__constant int   c_YCrCb2RGBCoeffs_i[4] = { CR2RI, CR2GI, CB2GI, CB2BI };

 __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
                        __global uchar* dst, int dst_step, int dst_offset,