Merge pull request #13440 from savuor:rgb_wide

* conversions of color_rgb.cpp vectorized * CL impl: coeffs updated * unused constants removed * CUDA color coeffs updated
2024-11-30 06:10:02 +08:00 · 2018-12-17 17:22:38 +03:00 · 2018-12-17 17:22:38 +03:00 · 493611ace5
commit 493611ace5
parent e058febaad
3 changed files with 764 additions and 823 deletions
--- a/modules/core/include/opencv2/core/cuda/detail/color_detail.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/color_detail.hpp
@ -92,13 +92,51 @@ namespace cv { namespace cuda { namespace device
            return vec.w;
        }

+        //constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
+        const float B2YF = 0.114f;
+        const float G2YF = 0.587f;
+        const float R2YF = 0.299f;
+
+        //to YCbCr
+        const float YCBF = 0.564f; // == 1/2/(1-B2YF)
+        const float YCRF = 0.713f; // == 1/2/(1-R2YF)
+        const int YCBI = 9241;  // == YCBF*16384
+        const int YCRI = 11682; // == YCRF*16384
+        //to YUV
+        const float B2UF = 0.492f;
+        const float R2VF = 0.877f;
+        const int B2UI = 8061;  // == B2UF*16384
+        const int R2VI = 14369; // == R2VF*16384
+        //from YUV
+        const float U2BF = 2.032f;
+        const float U2GF = -0.395f;
+        const float V2GF = -0.581f;
+        const float V2RF = 1.140f;
+        const int U2BI = 33292;
+        const int U2GI = -6472;
+        const int V2GI = -9519;
+        const int V2RI = 18678;
+        //from YCrCb
+        const float CB2BF = 1.773f;
+        const float CB2GF = -0.344f;
+        const float CR2GF = -0.714f;
+        const float CR2RF = 1.403f;
+        const int CB2BI = 29049;
+        const int CB2GI = -5636;
+        const int CR2GI = -11698;
+        const int CR2RI = 22987;
+
        enum
        {
            yuv_shift  = 14,
            xyz_shift  = 12,
+            gray_shift = 15,
            R2Y        = 4899,
            G2Y        = 9617,
            B2Y        = 1868,
+            RY15 =  9798, // == R2YF*32768 + 0.5
+            GY15 = 19235, // == G2YF*32768 + 0.5
+            BY15 =  3735, // == B2YF*32768 + 0.5
            BLOCK_SIZE = 256
        };
    }
@ -406,7 +444,7 @@ namespace cv { namespace cuda { namespace device
        {
            static __device__ __forceinline__ uchar cvt(uint t)
            {
-                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);
+                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * BY15 + ((t >> 3) & 0xfc) * GY15 + ((t >> 8) & 0xf8) * RY15, gray_shift);
            }
        };

@ -414,7 +452,7 @@ namespace cv { namespace cuda { namespace device
        {
            static __device__ __forceinline__ uchar cvt(uint t)
            {
-                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);
+                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * BY15 + ((t >> 2) & 0xf8) * GY15 + ((t >> 7) & 0xf8) * RY15, gray_shift);
            }
        };

@ -443,7 +481,7 @@ namespace cv { namespace cuda { namespace device
    {
        template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src)
        {
-            return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);
+            return (T)CV_DESCALE((unsigned)(src[bidx] * BY15 + src[1] * GY15 + src[bidx^2] * RY15), gray_shift);
        }

        template <int bidx> static __device__ __forceinline__ uchar RGB2GrayConvert(uint src)
@ -451,12 +489,12 @@ namespace cv { namespace cuda { namespace device
            uint b = 0xffu & (src >> (bidx * 8));
            uint g = 0xffu & (src >> 8);
            uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
-            return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);
+            return CV_DESCALE((uint)(b * BY15 + g * GY15 + r * RY15), gray_shift);
        }

        template <int bidx> static __device__ __forceinline__ float RGB2GrayConvert(const float* src)
        {
-            return src[bidx] * 0.114f + src[1] * 0.587f + src[bidx^2] * 0.299f;
+            return src[bidx] * B2YF + src[1] * G2YF + src[bidx^2] * R2YF;
        }

        template <typename T, int scn, int bidx> struct RGB2Gray : unary_function<typename TypeVec<T, scn>::vec_type, T>
@ -494,8 +532,8 @@ namespace cv { namespace cuda { namespace device

    namespace color_detail
    {
-        __constant__ float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
-        __constant__ int   c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };
+        __constant__ float c_RGB2YUVCoeffs_f[5] = { B2YF, G2YF, R2YF, B2UF, R2VF };
+        __constant__ int   c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, B2UI, R2VI };

        template <int bidx, typename T, typename D> static __device__ void RGB2YUVConvert(const T* src, D& dst)
        {
@ -543,8 +581,8 @@ namespace cv { namespace cuda { namespace device

    namespace color_detail
    {
-        __constant__ float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };
-        __constant__ int   c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 };
+        __constant__ float c_YUV2RGBCoeffs_f[5] = { U2BF, U2GF, V2GF, V2RF };
+        __constant__ int   c_YUV2RGBCoeffs_i[5] = { U2BI, U2GI, V2GI, V2RI };

        template <int bidx, typename T, typename D> static __device__ void YUV2RGBConvert(const T& src, D* dst)
        {
@ -633,8 +671,8 @@ namespace cv { namespace cuda { namespace device

    namespace color_detail
    {
-        __constant__ float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
-        __constant__ int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
+        __constant__ float c_RGB2YCrCbCoeffs_f[5] = {R2YF, G2YF, B2YF, YCRF, YCBF};
+        __constant__ int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, YCRI, YCBI};

        template <int bidx, typename T, typename D> static __device__ void RGB2YCrCbConvert(const T* src, D& dst)
        {
@ -710,8 +748,8 @@ namespace cv { namespace cuda { namespace device

    namespace color_detail
    {
-        __constant__ float c_YCrCb2RGBCoeffs_f[5] = {1.403f, -0.714f, -0.344f, 1.773f};
-        __constant__ int   c_YCrCb2RGBCoeffs_i[5] = {22987, -11698, -5636, 29049};
+        __constant__ float c_YCrCb2RGBCoeffs_f[5] = {CR2RF, CR2GF, CB2GF, CB2BF};
+        __constant__ int   c_YCrCb2RGBCoeffs_i[5] = {CR2RI, CR2GI, CB2GI, CB2BI};

        template <int bidx, typename T, typename D> static __device__ void YCrCb2RGBConvert(const T& src, D* dst)
        {
--- a/modules/imgproc/src/color_rgb.cpp
+++ b/modules/imgproc/src/color_rgb.cpp
--- a/modules/imgproc/src/opencl/color_rgb.cl
+++ b/modules/imgproc/src/opencl/color_rgb.cl
@ -76,10 +76,6 @@
 enum
 {
    gray_shift = 15,
-    yuv_shift  = 14,
-    R2Y        = 4899,
-    G2Y        = 9617,
-    B2Y        = 1868,
    RY15 = 9798,  // == R2YF*32768 + 0.5
    GY15 = 19235, // == G2YF*32768 + 0.5
    BY15 = 3735   // == B2YF*32768 + 0.5
@ -133,10 +129,8 @@ __kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offs
                DATA_TYPE_3 src_pix = vload3(0, src);
 #ifdef DEPTH_5
                dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
-#elif defined(DEPTH_0)
-                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, BY15, mad24(src_pix.G_COMP, GY15, mul24(src_pix.R_COMP, RY15))), gray_shift);
 #else
-                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
+                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, BY15, mad24(src_pix.G_COMP, GY15, mul24(src_pix.R_COMP, RY15))), gray_shift);
 #endif
                ++y;
                src_index += src_step;
@ -340,9 +334,9 @@ __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offse
                int t = *((__global const ushort*)(src + src_index));

 #if greenbits == 6
-                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
+                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, BY15, mad24((t >> 3) & 0xfc, GY15, ((t >> 8) & 0xf8) * RY15)), gray_shift);
 #else
-                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
+                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, BY15, mad24((t >> 2) & 0xf8, GY15, ((t >> 7) & 0xf8) * RY15)), gray_shift);
 #endif
                ++y;
                dst_index += dst_step;
@ -445,9 +439,10 @@ __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset
                    *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);
                else
                    *(__global uchar4 *)(dst + dst_index) =
-                        (uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,
-                                 mad24(src_pix.y, MAX_NUM, v3_half) / v3,
-                                 mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);
+                        (uchar4)(SAT_CAST(mad24(src_pix.x, MAX_NUM, v3_half) / v3),
+                                 SAT_CAST(mad24(src_pix.y, MAX_NUM, v3_half) / v3),
+                                 SAT_CAST(mad24(src_pix.z, MAX_NUM, v3_half) / v3),
+                                 SAT_CAST(v3));

                ++y;
                dst_index += dst_step;