Merge pull request #19883 from jondea:arm-neon-optimised-color-lab-3.4
* Add Neon optimised RGB2Lab conversion
* Fix compile errors, change lambda to macro
* Change NEON optimised RGB2Lab to just use HAL
* Change [] to v_extract_n in RGB2Lab
* RGB2Lab code quality, change to nlane-agnostic
* Change RGB2Lab to use function rather than macro
* Remove whitespace

Co-authored-by: Francesco Petrogalli <25690309+fpetrogalli@users.noreply.github.com>
This commit is contained in:
parent 63256a00ff
commit 8ecfbdb4ff
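For reference, the scalar fixed-point path that the new NEON batch code reproduces can be assembled from the commented reference lines in the diff below (not a new implementation; tab, LabCbrtTab_b, coeffs giving C0..C8, Lscale, Lshift, lab_shift and lab_shift2 are the tables and constants already defined in color_lab.cpp):

    // Scalar per-pixel reference, as quoted in the comments of the patch
    int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
    int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
    int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
    int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
    int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
    int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
    int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
    dst[0] = saturate_cast<uchar>(L);
    dst[1] = saturate_cast<uchar>(a);
    dst[2] = saturate_cast<uchar>(b);

The vector code below performs this same arithmetic on four pixels per batch, four batches per loop iteration.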
@@ -1536,6 +1536,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
#endif // CV_SIMD


struct RGB2Lab_b
{
    typedef uchar channel_type;
@@ -1571,6 +1573,69 @@ struct RGB2Lab_b
        }
    }

#if CV_NEON
    template <int n>
    inline void rgb2lab_batch(const ushort* tab,
                              const v_uint8 vRi, const v_uint8 vGi, const v_uint8 vBi,
                              v_int32& vL, v_int32& va, v_int32& vb) const
    {
        // Define some scalar constants which we will make use of later
        const int Lscale = (116*255+50)/100;
        const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
        const int xyzDescaleShift = (1 << (lab_shift - 1));
        const int labDescaleShift = (1 << (lab_shift2 - 1));
        const int abShift = 128*(1 << lab_shift2);

        const int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                  C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                  C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

        // int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
        v_int32 vR(tab[v_extract_n<4*n+0>(vRi)], tab[v_extract_n<4*n+1>(vRi)],
                   tab[v_extract_n<4*n+2>(vRi)], tab[v_extract_n<4*n+3>(vRi)]);
        v_int32 vG(tab[v_extract_n<4*n+0>(vGi)], tab[v_extract_n<4*n+1>(vGi)],
                   tab[v_extract_n<4*n+2>(vGi)], tab[v_extract_n<4*n+3>(vGi)]);
        v_int32 vB(tab[v_extract_n<4*n+0>(vBi)], tab[v_extract_n<4*n+1>(vBi)],
                   tab[v_extract_n<4*n+2>(vBi)], tab[v_extract_n<4*n+3>(vBi)]);

        /* int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)]; */
        v_int32 vfX = v_fma(vR, v_setall_s32(C0), v_setall_s32(xyzDescaleShift));
        vfX = v_fma(vG, v_setall_s32(C1), vfX);
        vfX = v_fma(vB, v_setall_s32(C2), vfX);
        vfX = v_shr<lab_shift>(vfX);
        vfX = v_int32(LabCbrtTab_b[v_extract_n<0>(vfX)], LabCbrtTab_b[v_extract_n<1>(vfX)],
                      LabCbrtTab_b[v_extract_n<2>(vfX)], LabCbrtTab_b[v_extract_n<3>(vfX)]);

        /* int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)]; */
        v_int32 vfY = v_fma(vR, v_setall_s32(C3), v_setall_s32(xyzDescaleShift));
        vfY = v_fma(vG, v_setall_s32(C4), vfY);
        vfY = v_fma(vB, v_setall_s32(C5), vfY);
        vfY = v_shr<lab_shift>(vfY);
        vfY = v_int32(LabCbrtTab_b[v_extract_n<0>(vfY)], LabCbrtTab_b[v_extract_n<1>(vfY)],
                      LabCbrtTab_b[v_extract_n<2>(vfY)], LabCbrtTab_b[v_extract_n<3>(vfY)]);

        /* int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)]; */
        v_int32 vfZ = v_fma(vR, v_setall_s32(C6), v_setall_s32(xyzDescaleShift));
        vfZ = v_fma(vG, v_setall_s32(C7), vfZ);
        vfZ = v_fma(vB, v_setall_s32(C8), vfZ);
        vfZ = v_shr<lab_shift>(vfZ);
        vfZ = v_int32(LabCbrtTab_b[v_extract_n<0>(vfZ)], LabCbrtTab_b[v_extract_n<1>(vfZ)],
                      LabCbrtTab_b[v_extract_n<2>(vfZ)], LabCbrtTab_b[v_extract_n<3>(vfZ)]);

        /* int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 ); */
        vL = v_fma(vfY, v_setall_s32(Lscale), v_setall_s32(Lshift+labDescaleShift));
        vL = v_shr<lab_shift2>(vL);

        /* int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 ); */
        va = v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
        va = v_shr<lab_shift2>(va);

        /* int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 ); */
        vb = v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
        vb = v_shr<lab_shift2>(vb);
    }
#endif // CV_NEON
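The lane gathers above use v_extract_n rather than NEON lane subscripting, which keeps rgb2lab_batch on the universal-intrinsic (HAL) API and independent of the exact lane count, as the commit message notes. A minimal standalone sketch of the same gather idea (gather4 is a hypothetical helper, not part of the patch; it uses the fixed 128-bit types, whereas the patch uses the native-width aliases, which are 128-bit under CV_NEON):

    #include <opencv2/core/hal/intrin.hpp>

    // Hypothetical helper: look up table entries for lanes 0..3 of a v_uint8x16
    // and pack the results into a v_int32x4, mirroring how rgb2lab_batch builds
    // vR/vG/vB from the gamma table.
    static inline cv::v_int32x4 gather4(const ushort* tab, const cv::v_uint8x16& v)
    {
        return cv::v_int32x4(tab[cv::v_extract_n<0>(v)], tab[cv::v_extract_n<1>(v)],
                             tab[cv::v_extract_n<2>(v)], tab[cv::v_extract_n<3>(v)]);
    }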

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        CV_INSTRUMENT_REGION();
@@ -1585,6 +1650,45 @@ struct RGB2Lab_b
        i = 0;

#if CV_NEON
        // On each loop, we load nlanes of RGB/A v_uint8s and store nlanes of
        // Lab v_uint8s
        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes,
              src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes )
        {
            // Load 4 batches of 4 src
            v_uint8 vRi, vGi, vBi;
            if(scn == 4)
            {
                v_uint8 vAi;
                v_load_deinterleave(src, vRi, vGi, vBi, vAi);
            }
            else // scn == 3
            {
                v_load_deinterleave(src, vRi, vGi, vBi);
            }

            // Do 4 batches of 4 RGB2Labs
            v_int32 vL0, va0, vb0;
            rgb2lab_batch<0>(tab, vRi, vGi, vBi, vL0, va0, vb0);
            v_int32 vL1, va1, vb1;
            rgb2lab_batch<1>(tab, vRi, vGi, vBi, vL1, va1, vb1);
            v_int32 vL2, va2, vb2;
            rgb2lab_batch<2>(tab, vRi, vGi, vBi, vL2, va2, vb2);
            v_int32 vL3, va3, vb3;
            rgb2lab_batch<3>(tab, vRi, vGi, vBi, vL3, va3, vb3);

            // Saturate, combine and store all batches
            // dst[0] = saturate_cast<uchar>(L);
            // dst[1] = saturate_cast<uchar>(a);
            // dst[2] = saturate_cast<uchar>(b);
            v_store_interleave(dst,
                               v_pack(v_pack_u(vL0, vL1), v_pack_u(vL2, vL3)),
                               v_pack(v_pack_u(va0, va1), v_pack_u(va2, va3)),
                               v_pack(v_pack_u(vb0, vb1), v_pack_u(vb2, vb3)));
        }
#endif // CV_NEON
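The store step narrows the four int32 batches to uchar with saturation before interleaving L, a and b planes back into packed pixels. A standalone sketch of that narrowing, using the fixed 128-bit types (pack_to_u8 is a hypothetical helper name, not part of the patch):

    #include <opencv2/core/hal/intrin.hpp>

    // Saturating narrowing as used in the v_store_interleave call above:
    // v_pack_u: signed 32-bit -> unsigned 16-bit with saturation (negatives clamp to 0)
    // v_pack:   unsigned 16-bit -> unsigned 8-bit with saturation (values above 255 clamp)
    static inline cv::v_uint8x16 pack_to_u8(const cv::v_int32x4& a, const cv::v_int32x4& b,
                                            const cv::v_int32x4& c, const cv::v_int32x4& d)
    {
        return cv::v_pack(cv::v_pack_u(a, b), cv::v_pack_u(c, d));
    }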

#if CV_SIMD
        const int vsize = v_uint8::nlanes;
        const int xyzDescaleShift = 1 << (lab_shift - 1);