diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp
index 337d601f69..a181880862 100644
--- a/modules/imgproc/src/color_lab.cpp
+++ b/modules/imgproc/src/color_lab.cpp
@@ -1536,6 +1536,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
 
 #endif // CV_SIMD
 
+
+
 struct RGB2Lab_b
 {
     typedef uchar channel_type;
@@ -1571,6 +1573,69 @@ struct RGB2Lab_b
         }
     }
 
+#if CV_NEON
+    template <int n>
+    inline void rgb2lab_batch(const ushort* tab,
+                              const v_uint8 vRi, const v_uint8 vGi, const v_uint8 vBi,
+                              v_int32& vL, v_int32& va, v_int32& vb) const
+    {
+        // Define some scalar constants which we will make use of later
+        const int Lscale = (116*255+50)/100;
+        const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
+        const int xyzDescaleShift = (1 << (lab_shift - 1));
+        const int labDescaleShift = (1 << (lab_shift2 - 1));
+        const int abShift = 128*(1 << lab_shift2);
+
+        const int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+                  C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+                  C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+        // int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
+        v_int32 vR(tab[v_extract_n<4*n+0>(vRi)], tab[v_extract_n<4*n+1>(vRi)],
+                   tab[v_extract_n<4*n+2>(vRi)], tab[v_extract_n<4*n+3>(vRi)]);
+        v_int32 vG(tab[v_extract_n<4*n+0>(vGi)], tab[v_extract_n<4*n+1>(vGi)],
+                   tab[v_extract_n<4*n+2>(vGi)], tab[v_extract_n<4*n+3>(vGi)]);
+        v_int32 vB(tab[v_extract_n<4*n+0>(vBi)], tab[v_extract_n<4*n+1>(vBi)],
+                   tab[v_extract_n<4*n+2>(vBi)], tab[v_extract_n<4*n+3>(vBi)]);
+
+        /* int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];*/
+        v_int32 vfX = v_fma(vR, v_setall_s32(C0), v_setall_s32(xyzDescaleShift));
+        vfX = v_fma(vG, v_setall_s32(C1), vfX);
+        vfX = v_fma(vB, v_setall_s32(C2), vfX);
+        vfX = v_shr<lab_shift>(vfX);
+        vfX = v_int32(LabCbrtTab_b[v_extract_n<0>(vfX)], LabCbrtTab_b[v_extract_n<1>(vfX)],
+                      LabCbrtTab_b[v_extract_n<2>(vfX)], LabCbrtTab_b[v_extract_n<3>(vfX)]);
+
+        /* int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)]; */
+        v_int32 vfY = v_fma(vR, v_setall_s32(C3), v_setall_s32(xyzDescaleShift));
+        vfY = v_fma(vG, v_setall_s32(C4), vfY);
+        vfY = v_fma(vB, v_setall_s32(C5), vfY);
+        vfY = v_shr<lab_shift>(vfY);
+        vfY = v_int32(LabCbrtTab_b[v_extract_n<0>(vfY)], LabCbrtTab_b[v_extract_n<1>(vfY)],
+                      LabCbrtTab_b[v_extract_n<2>(vfY)], LabCbrtTab_b[v_extract_n<3>(vfY)]);
+
+        /* int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];*/
+        v_int32 vfZ = v_fma(vR, v_setall_s32(C6), v_setall_s32(xyzDescaleShift));
+        vfZ = v_fma(vG, v_setall_s32(C7), vfZ);
+        vfZ = v_fma(vB, v_setall_s32(C8), vfZ);
+        vfZ = v_shr<lab_shift>(vfZ);
+        vfZ = v_int32(LabCbrtTab_b[v_extract_n<0>(vfZ)], LabCbrtTab_b[v_extract_n<1>(vfZ)],
+                      LabCbrtTab_b[v_extract_n<2>(vfZ)], LabCbrtTab_b[v_extract_n<3>(vfZ)]);
+
+        /* int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );*/
+        vL = v_fma(vfY, v_setall_s32(Lscale), v_setall_s32(Lshift+labDescaleShift));
+        vL = v_shr<lab_shift2>(vL);
+
+        /* int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );*/
+        va = v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
+        va = v_shr<lab_shift2>(va);
+
+        /* int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );*/
+        vb = v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
+        vb = v_shr<lab_shift2>(vb);
+    }
+#endif // CV_NEON
+
     void operator()(const uchar* src, uchar* dst, int n) const
     {
         CV_INSTRUMENT_REGION();
@@ -1585,6 +1650,45 @@ struct RGB2Lab_b
 
         i = 0;
 
+#if CV_NEON
+        // On each loop, we load nlanes of RGB/A v_uint8s and store nlanes of
+        // Lab v_uint8s
+        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes,
+              src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes )
+        {
+            // Load 4 batches of 4 src
+            v_uint8 vRi, vGi, vBi;
+            if(scn == 4)
+            {
+                v_uint8 vAi;
+                v_load_deinterleave(src, vRi, vGi, vBi, vAi);
+            }
+            else // scn == 3
+            {
+                v_load_deinterleave(src, vRi, vGi, vBi);
+            }
+
+            // Do 4 batches of 4 RGB2Labs
+            v_int32 vL0, va0, vb0;
+            rgb2lab_batch<0>(tab, vRi, vGi, vBi, vL0, va0, vb0);
+            v_int32 vL1, va1, vb1;
+            rgb2lab_batch<1>(tab, vRi, vGi, vBi, vL1, va1, vb1);
+            v_int32 vL2, va2, vb2;
+            rgb2lab_batch<2>(tab, vRi, vGi, vBi, vL2, va2, vb2);
+            v_int32 vL3, va3, vb3;
+            rgb2lab_batch<3>(tab, vRi, vGi, vBi, vL3, va3, vb3);
+
+            // Saturate, combine and store all batches
+            // dst[0] = saturate_cast<uchar>(L);
+            // dst[1] = saturate_cast<uchar>(a);
+            // dst[2] = saturate_cast<uchar>(b);
+            v_store_interleave(dst,
+                               v_pack(v_pack_u(vL0, vL1), v_pack_u(vL2, vL3)),
+                               v_pack(v_pack_u(va0, va1), v_pack_u(va2, va3)),
+                               v_pack(v_pack_u(vb0, vb1), v_pack_u(vb2, vb3)));
+        }
+#endif // CV_NEON
+
 #if CV_SIMD
         const int vsize = v_uint8::nlanes;
         const int xyzDescaleShift = 1 << (lab_shift - 1);