Merge pull request #8967 from savuor:rgb2lab_faster

2025-08-06 14:36:36 +08:00 · 2017-07-17 19:55:19 +00:00 · 2017-07-17 19:55:19 +00:00 · 6ba22dea46
commit 6ba22dea46
parent e5ed9cc612 70b984434d
2 changed files with 364 additions and 17 deletions
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@ -140,9 +140,6 @@ const int CB2GI = -5636;
 const int CR2GI = -11698;
 const int CR2RI = 22987;

-static const double _1_3 = 0.333333333333;
-static const float _1_3f = static_cast<float>(_1_3);
-
 // computes cubic spline coefficients for a function: (xi=i, yi=f[i]), i=0..n
 template<typename _Tp> static void splineBuild(const _Tp* f, int n, _Tp* tab)
 {
@ -5811,6 +5808,7 @@ struct HLS2RGB_b
    #endif
 };

+
 ///////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////

 static const float D65[] = { 0.950456f, 1.f, 1.088754f };
@ -5833,12 +5831,19 @@ static ushort sRGBInvGammaTab_b[INV_GAMMA_TAB_SIZE], linearInvGammaTab_b[INV_GAM
 static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];

 static const bool enableBitExactness = true;
+static const bool enableRGB2LabInterpolation = true;
 static const bool enablePackedLab = true;
 enum
 {
+    lab_lut_shift = 5,
+    LAB_LUT_DIM = (1 << lab_lut_shift)+1,
    lab_base_shift = 14,
    LAB_BASE = (1 << lab_base_shift),
+    trilinear_shift = 8 - lab_lut_shift + 1,
+    TRILINEAR_BASE = (1 << trilinear_shift)
 };
+static int16_t RGB2LabLUT_s16[LAB_LUT_DIM*LAB_LUT_DIM*LAB_LUT_DIM*3*8];
+static int16_t trilinearLUT[TRILINEAR_BASE*TRILINEAR_BASE*TRILINEAR_BASE*8];
 static ushort LabToYF_b[256*2];
 static const int minABvalue = -8145;
 static int abToXZ_b[LAB_BASE*9/4];
@ -5971,11 +5976,243 @@ static void initLabTabs()
            abToXZ_b[i-minABvalue] = v; // -1335 <= v <= 88231
        }

+        if(enableRGB2LabInterpolation)
+        {
+            const float* _whitept = D65;
+            softfloat coeffs[9];
+
+            //RGB2Lab coeffs
+            softfloat scaleWhite[] = { softfloat::one()/softfloat(_whitept[0]),
+                                       softfloat::one(),
+                                       softfloat::one()/softfloat(_whitept[2]) };
+
+            for(i = 0; i < 3; i++ )
+            {
+                int j = i * 3;
+                coeffs[j + 2] = scaleWhite[i] * softfloat(sRGB2XYZ_D65[j    ]);
+                coeffs[j + 1] = scaleWhite[i] * softfloat(sRGB2XYZ_D65[j + 1]);
+                coeffs[j + 0] = scaleWhite[i] * softfloat(sRGB2XYZ_D65[j + 2]);
+            }
+
+            softfloat D0 = coeffs[0], D1 = coeffs[1], D2 = coeffs[2],
+                      D3 = coeffs[3], D4 = coeffs[4], D5 = coeffs[5],
+                      D6 = coeffs[6], D7 = coeffs[7], D8 = coeffs[8];
+
+            //903.3f = (29/3)^3
+            static const softfloat lld(LAB_LUT_DIM - 1), f116(116), f16(16), f500(500), f200(200);
+            static const softfloat f100(100), f128(128), f256(256), lbase((int)LAB_BASE);
+            static const softfloat f9033 = softfloat(29*29*29)/softfloat(27);
+            AutoBuffer<int16_t> RGB2Labprev(LAB_LUT_DIM*LAB_LUT_DIM*LAB_LUT_DIM*3);
+            for(int p = 0; p < LAB_LUT_DIM; p++)
+            {
+                for(int q = 0; q < LAB_LUT_DIM; q++)
+                {
+                    for(int r = 0; r < LAB_LUT_DIM; r++)
+                    {
+                        //RGB 2 Lab LUT building
+                        softfloat R = softfloat(p)/lld;
+                        softfloat G = softfloat(q)/lld;
+                        softfloat B = softfloat(r)/lld;
+
+                        R = applyGamma(R);
+                        G = applyGamma(G);
+                        B = applyGamma(B);
+
+                        softfloat X = R*D0 + G*D1 + B*D2;
+                        softfloat Y = R*D3 + G*D4 + B*D5;
+                        softfloat Z = R*D6 + G*D7 + B*D8;
+
+                        softfloat FX = X > lthresh ? cbrt(X) : mulAdd(X, lscale, lbias);
+                        softfloat FY = Y > lthresh ? cbrt(Y) : mulAdd(Y, lscale, lbias);
+                        softfloat FZ = Z > lthresh ? cbrt(Z) : mulAdd(Z, lscale, lbias);
+
+                        softfloat L = Y > lthresh ? (f116*FY - f16) : (f9033*Y);
+                        softfloat a = f500 * (FX - FY);
+                        softfloat b = f200 * (FY - FZ);
+
+                        int idx = p*3 + q*LAB_LUT_DIM*3 + r*LAB_LUT_DIM*LAB_LUT_DIM*3;
+                        RGB2Labprev[idx]   = (int16_t)(cvRound(lbase*L/f100));
+                        RGB2Labprev[idx+1] = (int16_t)(cvRound(lbase*(a + f128)/f256));
+                        RGB2Labprev[idx+2] = (int16_t)(cvRound(lbase*(b + f128)/f256));
+                    }
+                }
+            }
+            for(int p = 0; p < LAB_LUT_DIM; p++)
+            {
+                for(int q = 0; q < LAB_LUT_DIM; q++)
+                {
+                    for(int r = 0; r < LAB_LUT_DIM; r++)
+                    {
+                        #define FILL(_p, _q, _r) \
+                            do {\
+                                int idxold = 0;\
+                                idxold += min(p+(_p), (int)(LAB_LUT_DIM-1))*3;\
+                                idxold += min(q+(_q), (int)(LAB_LUT_DIM-1))*LAB_LUT_DIM*3;\
+                                idxold += min(r+(_r), (int)(LAB_LUT_DIM-1))*LAB_LUT_DIM*LAB_LUT_DIM*3;\
+                                int idxnew = p*3*8 + q*LAB_LUT_DIM*3*8 + r*LAB_LUT_DIM*LAB_LUT_DIM*3*8+4*(_p)+2*(_q)+(_r);\
+                                RGB2LabLUT_s16[idxnew]    = RGB2Labprev[idxold];\
+                                RGB2LabLUT_s16[idxnew+8]  = RGB2Labprev[idxold+1];\
+                                RGB2LabLUT_s16[idxnew+16] = RGB2Labprev[idxold+2];\
+                            } while(0)
+
+                        FILL(0, 0, 0); FILL(0, 0, 1);
+                        FILL(0, 1, 0); FILL(0, 1, 1);
+                        FILL(1, 0, 0); FILL(1, 0, 1);
+                        FILL(1, 1, 0); FILL(1, 1, 1);
+
+                        #undef FILL
+                    }
+                }
+            }
+
+            for(int16_t p = 0; p < TRILINEAR_BASE; p++)
+            {
+                int16_t pp = TRILINEAR_BASE - p;
+                for(int16_t q = 0; q < TRILINEAR_BASE; q++)
+                {
+                    int16_t qq = TRILINEAR_BASE - q;
+                    for(int16_t r = 0; r < TRILINEAR_BASE; r++)
+                    {
+                        int16_t rr = TRILINEAR_BASE - r;
+                        int16_t* w = &trilinearLUT[8*p + 8*TRILINEAR_BASE*q + 8*TRILINEAR_BASE*TRILINEAR_BASE*r];
+                        w[0]  = pp * qq * rr; w[1]  = pp * qq * r ; w[2]  = pp * q  * rr; w[3]  = pp * q  * r ;
+                        w[4]  = p  * qq * rr; w[5]  = p  * qq * r ; w[6]  = p  * q  * rr; w[7]  = p  * q  * r ;
+                    }
+                }
+            }
+        }
+
        initialized = true;
    }
 }


+// cx, cy, cz are in [0; LAB_BASE]
+static inline void trilinearInterpolate(int cx, int cy, int cz, int16_t* LUT,
+                                        int& a, int& b, int& c)
+{
+    //LUT idx of origin pt of cube
+    int tx = cx >> (lab_base_shift - lab_lut_shift);
+    int ty = cy >> (lab_base_shift - lab_lut_shift);
+    int tz = cz >> (lab_base_shift - lab_lut_shift);
+
+    int16_t* baseLUT = &LUT[3*8*tx + (3*8*LAB_LUT_DIM)*ty + (3*8*LAB_LUT_DIM*LAB_LUT_DIM)*tz];
+    int aa[8], bb[8], cc[8];
+    for(int i = 0; i < 8; i++)
+    {
+        aa[i] = baseLUT[i]; bb[i] = baseLUT[i+8]; cc[i] = baseLUT[i+16];
+    }
+
+    //x, y, z are [0; TRILINEAR_BASE)
+    static const int bitMask = (1 << trilinear_shift) - 1;
+    int x = (cx >> (lab_base_shift - 8 - 1)) & bitMask;
+    int y = (cy >> (lab_base_shift - 8 - 1)) & bitMask;
+    int z = (cz >> (lab_base_shift - 8 - 1)) & bitMask;
+
+    int w[8];
+    for(int i = 0; i < 8; i++)
+    {
+        w[i] = trilinearLUT[8*x + 8*TRILINEAR_BASE*y + 8*TRILINEAR_BASE*TRILINEAR_BASE*z + i];
+    }
+
+    a = aa[0]*w[0]+aa[1]*w[1]+aa[2]*w[2]+aa[3]*w[3]+aa[4]*w[4]+aa[5]*w[5]+aa[6]*w[6]+aa[7]*w[7];
+    b = bb[0]*w[0]+bb[1]*w[1]+bb[2]*w[2]+bb[3]*w[3]+bb[4]*w[4]+bb[5]*w[5]+bb[6]*w[6]+bb[7]*w[7];
+    c = cc[0]*w[0]+cc[1]*w[1]+cc[2]*w[2]+cc[3]*w[3]+cc[4]*w[4]+cc[5]*w[5]+cc[6]*w[6]+cc[7]*w[7];
+
+    a = CV_DESCALE(a, trilinear_shift*3);
+    b = CV_DESCALE(b, trilinear_shift*3);
+    c = CV_DESCALE(c, trilinear_shift*3);
+}
+
+
+// 8 inValues are in [0; LAB_BASE]
+static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uint16x8& inY, const v_uint16x8& inZ,
+                                              const int16_t* LUT,
+                                              v_uint16x8& outA, v_uint16x8& outB, v_uint16x8& outC)
+{
+    //LUT idx of origin pt of cube
+    v_uint16x8 idxsX = inX >> (lab_base_shift - lab_lut_shift);
+    v_uint16x8 idxsY = inY >> (lab_base_shift - lab_lut_shift);
+    v_uint16x8 idxsZ = inZ >> (lab_base_shift - lab_lut_shift);
+
+    //x, y, z are [0; TRILINEAR_BASE)
+    const uint16_t bitMask = (1 << trilinear_shift) - 1;
+    v_uint16x8 bitMaskReg = v_setall_u16(bitMask);
+    v_uint16x8 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg;
+    v_uint16x8 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg;
+    v_uint16x8 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg;
+
+    //load values to interpolate for pix0, pix1, .., pix7
+    v_int16x8 a0, a1, a2, a3, a4, a5, a6, a7;
+    v_int16x8 b0, b1, b2, b3, b4, b5, b6, b7;
+    v_int16x8 c0, c1, c2, c3, c4, c5, c6, c7;
+
+    v_uint32x4 addrDw0, addrDw1, addrDw10, addrDw11;
+    v_mul_expand(v_setall_u16(3*8), idxsX, addrDw0, addrDw1);
+    v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM), idxsY, addrDw10, addrDw11);
+    addrDw0 += addrDw10; addrDw1 += addrDw11;
+    v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), idxsZ, addrDw10, addrDw11);
+    addrDw0 += addrDw10; addrDw1 += addrDw11;
+
+    uint32_t CV_DECL_ALIGNED(16) addrofs[8];
+    v_store_aligned(addrofs, addrDw0);
+    v_store_aligned(addrofs + 4, addrDw1);
+
+    const int16_t* ptr;
+#define LOAD_ABC(n) ptr = LUT + addrofs[n]; a##n = v_load(ptr); b##n = v_load(ptr + 8); c##n = v_load(ptr + 16)
+    LOAD_ABC(0);
+    LOAD_ABC(1);
+    LOAD_ABC(2);
+    LOAD_ABC(3);
+    LOAD_ABC(4);
+    LOAD_ABC(5);
+    LOAD_ABC(6);
+    LOAD_ABC(7);
+#undef LOAD_ABC
+
+    //interpolation weights for pix0, pix1, .., pix7
+    v_int16x8 w0, w1, w2, w3, w4, w5, w6, w7;
+    v_mul_expand(v_setall_u16(8), fracX, addrDw0, addrDw1);
+    v_mul_expand(v_setall_u16(8*TRILINEAR_BASE), fracY, addrDw10, addrDw11);
+    addrDw0 += addrDw10; addrDw1 += addrDw11;
+    v_mul_expand(v_setall_u16(8*TRILINEAR_BASE*TRILINEAR_BASE), fracZ, addrDw10, addrDw11);
+    addrDw0 += addrDw10; addrDw1 += addrDw11;
+
+    v_store_aligned(addrofs, addrDw0);
+    v_store_aligned(addrofs + 4, addrDw1);
+
+#define LOAD_W(n) ptr = trilinearLUT + addrofs[n]; w##n = v_load(ptr)
+    LOAD_W(0);
+    LOAD_W(1);
+    LOAD_W(2);
+    LOAD_W(3);
+    LOAD_W(4);
+    LOAD_W(5);
+    LOAD_W(6);
+    LOAD_W(7);
+#undef LOAD_W
+
+    //outA = descale(v_reg<8>(sum(dot(ai, wi))))
+    v_uint32x4 part0, part1;
+#define DOT_SHIFT_PACK(l, ll) \
+    part0 = v_uint32x4(v_reduce_sum(v_dotprod(l##0, w0)),\
+                       v_reduce_sum(v_dotprod(l##1, w1)),\
+                       v_reduce_sum(v_dotprod(l##2, w2)),\
+                       v_reduce_sum(v_dotprod(l##3, w3)));\
+    part1 = v_uint32x4(v_reduce_sum(v_dotprod(l##4, w4)),\
+                       v_reduce_sum(v_dotprod(l##5, w5)),\
+                       v_reduce_sum(v_dotprod(l##6, w6)),\
+                       v_reduce_sum(v_dotprod(l##7, w7)));\
+    (ll) = v_rshr_pack<trilinear_shift*3>(part0, part1)
+
+    DOT_SHIFT_PACK(a, outA);
+    DOT_SHIFT_PACK(b, outB);
+    DOT_SHIFT_PACK(c, outC);
+
+#undef DOT_SHIFT_PACK
+}
+
+
 struct RGB2Lab_b
 {
    typedef uchar channel_type;
@ -6043,13 +6280,15 @@ struct RGB2Lab_f
 {
    typedef float channel_type;

-    RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,
+    RGB2Lab_f(int _srccn, int _blueIdx, const float* _coeffs,
              const float* _whitept, bool _srgb)
-    : srccn(_srccn), srgb(_srgb)
+    : srccn(_srccn), srgb(_srgb), blueIdx(_blueIdx)
    {
        volatile int _3 = 3;
        initLabTabs();

+        useInterpolation = (!_coeffs && !_whitept && srgb && enableRGB2LabInterpolation);
+
        if (!_coeffs)
            _coeffs = sRGB2XYZ_D65;
        if (!_whitept)
@ -6076,7 +6315,7 @@ struct RGB2Lab_f

    void operator()(const float* src, float* dst, int n) const
    {
-        int i, scn = srccn;
+        int i, scn = srccn, bIdx = blueIdx;
        float gscale = GammaTabScale;
        const float* gammaTab = srgb ? sRGBGammaTab : 0;
        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
@ -6084,8 +6323,112 @@ struct RGB2Lab_f
              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
        n *= 3;

+        i = 0;
+        if(useInterpolation)
+        {
+            if(enablePackedLab)
+            {
+                static const int nPixels = 4*2;
+                for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
+                {
+                    v_float32x4 rvec0, gvec0, bvec0, rvec1, gvec1, bvec1;
+                    v_float32x4 dummy0, dummy1;
+                    if(scn == 3)
+                    {
+                        v_load_deinterleave(src, rvec0, gvec0, bvec0);
+                        v_load_deinterleave(src + scn*4, rvec1, gvec1, bvec1);
+                    }
+                    else // scn == 4
+                    {
+                        v_load_deinterleave(src, rvec0, gvec0, bvec0, dummy0);
+                        v_load_deinterleave(src + scn*4, rvec1, gvec1, bvec1, dummy1);
+                    }
+
+                    if(bIdx)
+                    {
+                        dummy0 = rvec0; rvec0 = bvec0; bvec0 = dummy0;
+                        dummy1 = rvec1; rvec1 = bvec1; bvec1 = dummy1;
+                    }
+
+                    v_float32x4 zerof = v_setzero_f32(), onef = v_setall_f32(1.0f);
+                    /* clip() */
+                    #define clipv(r) (r) = v_min(v_max((r), zerof), onef)
+                    clipv(rvec0); clipv(rvec1);
+                    clipv(gvec0); clipv(gvec1);
+                    clipv(bvec0); clipv(bvec1);
+                    #undef clipv
+                    /* int iR = R*LAB_BASE, iG = G*LAB_BASE, iB = B*LAB_BASE, iL, ia, ib; */
+                    v_float32x4 basef = v_setall_f32(LAB_BASE);
+                    rvec0 *= basef, gvec0 *= basef, bvec0 *= basef;
+                    rvec1 *= basef, gvec1 *= basef, bvec1 *= basef;
+
+                    v_int32x4 irvec0, igvec0, ibvec0, irvec1, igvec1, ibvec1;
+                    irvec0 = v_round(rvec0); irvec1 = v_round(rvec1);
+                    igvec0 = v_round(gvec0); igvec1 = v_round(gvec1);
+                    ibvec0 = v_round(bvec0); ibvec1 = v_round(bvec1);
+
+                    v_int16x8 irvec, igvec, ibvec;
+                    irvec = v_pack(irvec0, irvec1);
+                    igvec = v_pack(igvec0, igvec1);
+                    ibvec = v_pack(ibvec0, ibvec1);
+
+                    v_uint16x8 uirvec = v_reinterpret_as_u16(irvec);
+                    v_uint16x8 uigvec = v_reinterpret_as_u16(igvec);
+                    v_uint16x8 uibvec = v_reinterpret_as_u16(ibvec);
+
+                    v_uint16x8 ui_lvec, ui_avec, ui_bvec;
+                    trilinearPackedInterpolate(uirvec, uigvec, uibvec, RGB2LabLUT_s16, ui_lvec, ui_avec, ui_bvec);
+                    v_int16x8 i_lvec = v_reinterpret_as_s16(ui_lvec);
+                    v_int16x8 i_avec = v_reinterpret_as_s16(ui_avec);
+                    v_int16x8 i_bvec = v_reinterpret_as_s16(ui_bvec);
+
+                    /* float L = iL*1.0f/LAB_BASE, a = ia*1.0f/LAB_BASE, b = ib*1.0f/LAB_BASE; */
+                    v_int32x4 i_lvec0, i_avec0, i_bvec0, i_lvec1, i_avec1, i_bvec1;
+                    v_expand(i_lvec, i_lvec0, i_lvec1);
+                    v_expand(i_avec, i_avec0, i_avec1);
+                    v_expand(i_bvec, i_bvec0, i_bvec1);
+
+                    v_float32x4 l_vec0, a_vec0, b_vec0, l_vec1, a_vec1, b_vec1;
+                    l_vec0 = v_cvt_f32(i_lvec0); l_vec1 = v_cvt_f32(i_lvec1);
+                    a_vec0 = v_cvt_f32(i_avec0); a_vec1 = v_cvt_f32(i_avec1);
+                    b_vec0 = v_cvt_f32(i_bvec0); b_vec1 = v_cvt_f32(i_bvec1);
+
+                    /* dst[i] = L*100.0f */
+                    l_vec0 = l_vec0*v_setall_f32(100.0f/LAB_BASE);
+                    l_vec1 = l_vec1*v_setall_f32(100.0f/LAB_BASE);
+                    /*
+                    dst[i + 1] = a*256.0f - 128.0f;
+                    dst[i + 2] = b*256.0f - 128.0f;
+                    */
+                    a_vec0 = a_vec0*v_setall_f32(256.0f/LAB_BASE) - v_setall_f32(128.0f);
+                    a_vec1 = a_vec1*v_setall_f32(256.0f/LAB_BASE) - v_setall_f32(128.0f);
+                    b_vec0 = b_vec0*v_setall_f32(256.0f/LAB_BASE) - v_setall_f32(128.0f);
+                    b_vec1 = b_vec1*v_setall_f32(256.0f/LAB_BASE) - v_setall_f32(128.0f);
+
+                    v_store_interleave(dst + i, l_vec0, a_vec0, b_vec0);
+                    v_store_interleave(dst + i + 3*4, l_vec1, a_vec1, b_vec1);
+                }
+            }
+
+            for(; i < n; i += 3, src += scn)
+            {
+                float R = clip(src[bIdx]);
+                float G = clip(src[1]);
+                float B = clip(src[bIdx^2]);
+
+                int iR = cvRound(R*LAB_BASE), iG = cvRound(G*LAB_BASE), iB = cvRound(B*LAB_BASE);
+                int iL, ia, ib;
+                trilinearInterpolate(iR, iG, iB, RGB2LabLUT_s16, iL, ia, ib);
+                float L = iL*1.0f/LAB_BASE, a = ia*1.0f/LAB_BASE, b = ib*1.0f/LAB_BASE;
+
+                dst[i] = L*100.0f;
+                dst[i + 1] = a*256.0f - 128.0f;
+                dst[i + 2] = b*256.0f - 128.0f;
+            }
+        }
+
        static const float _a = (softfloat(16) / softfloat(116));
-        for (i = 0; i < n; i += 3, src += scn )
+        for (; i < n; i += 3, src += scn )
        {
            float R = clip(src[0]);
            float G = clip(src[1]);
@ -6101,9 +6444,9 @@ struct RGB2Lab_f
            float Y = R*C3 + G*C4 + B*C5;
            float Z = R*C6 + G*C7 + B*C8;
            // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
-            float FX = X > 0.008856f ? std::pow(X, _1_3f) : (7.787f * X + _a);
-            float FY = Y > 0.008856f ? std::pow(Y, _1_3f) : (7.787f * Y + _a);
-            float FZ = Z > 0.008856f ? std::pow(Z, _1_3f) : (7.787f * Z + _a);
+            float FX = X > 0.008856f ? cubeRoot(X) : (7.787f * X + _a);
+            float FY = Y > 0.008856f ? cubeRoot(Y) : (7.787f * Y + _a);
+            float FZ = Z > 0.008856f ? cubeRoot(Z) : (7.787f * Z + _a);

            float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
            float a = 500.f * (FX - FY);
@ -6118,6 +6461,8 @@ struct RGB2Lab_f
    int srccn;
    float coeffs[9];
    bool srgb;
+    bool useInterpolation;
+    int blueIdx;
 };


@ -9123,7 +9468,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
            }

-            float _a = softfloat(16)/softfloat(116);
+            static const float _a = softfloat(16)/softfloat(116);
+            static const float _1_3f = softfloat::one()/softfloat(3);
            ocl::KernelArg ucoeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);

            if (lab)
--- a/modules/photo/test/test_npr.cpp
+++ b/modules/photo/test/test_npr.cpp
@ -62,8 +62,9 @@ TEST(Photo_NPR_EdgePreserveSmoothing_RecursiveFilter, regression)
    edgePreservingFilter(source,result,1);

    Mat reference = imread(folder + "smoothened_RF_reference.png");
-    double error = cvtest::norm(reference, result, NORM_L1);
-    EXPECT_LE(error, numerical_precision);
+
+    double psnr = cvtest::PSNR(reference, result);
+    EXPECT_GT(psnr, 60.0);
 }

 TEST(Photo_NPR_EdgePreserveSmoothing_NormConvFilter, regression)
@ -79,9 +80,9 @@ TEST(Photo_NPR_EdgePreserveSmoothing_NormConvFilter, regression)
    edgePreservingFilter(source,result,2);

    Mat reference = imread(folder + "smoothened_NCF_reference.png");
-    double error = cvtest::norm(reference, result, NORM_L1);
-    EXPECT_LE(error, numerical_precision);

+    double psnr = cvtest::PSNR(reference, result);
+    EXPECT_GT(psnr, 60.0);
 }

 TEST(Photo_NPR_DetailEnhance, regression)
@ -97,8 +98,8 @@ TEST(Photo_NPR_DetailEnhance, regression)
    detailEnhance(source,result);

    Mat reference = imread(folder + "detail_enhanced_reference.png");
-    double error = cvtest::norm(reference, result, NORM_L1);
-    EXPECT_LE(error, numerical_precision);
+    double psnr = cvtest::PSNR(reference, result);
+    EXPECT_GT(psnr, 60.0);
 }

 TEST(Photo_NPR_PencilSketch, regression)