Add SSE2 code path for RGB2Luv_f.

This commit is contained in:
k-shinotsuka 2016-10-22 19:56:55 +09:00
parent dd379ec9fd
commit 992795d47d

View File

@ -141,6 +141,39 @@ template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab
return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
}
#if CV_SSE2
// SSE2 variant of splineInterpolate(): evaluates a piecewise cubic for four
// lanes at once and overwrites v_x with the four results.
//
//   v_x - in: four spline arguments; out: the four interpolated values.
//   tab - coefficient table, read as four consecutive floats per segment
//         (tab[4*k] .. tab[4*k+3] for segment k). The loads below go through
//         _mm_loadu_ps, so _Tp has to be float for this to compile.
//   n   - the segment index of every lane is clamped to [0, n-1].
template<typename _Tp> static inline void splineInterpolate(__m128& v_x, const _Tp* tab, int n)
{
    // Clamp first, then convert with truncation toward zero.  The truncating
    // conversion (cvtt) is required here: _mm_cvtps_epi32 rounds according to
    // MXCSR (to nearest by default), which would pick segment k+1 for any
    // argument in [k+0.5, k+1) and leave a negative fractional part, i.e. the
    // wrong polynomial evaluated outside its interval.
    __m128 v_clamped = _mm_min_ps(_mm_max_ps(v_x, _mm_setzero_ps()), _mm_set1_ps(float(n - 1)));
    __m128i v_ix = _mm_cvttps_epi32(v_clamped);

    // Fractional offset inside the chosen segment (taken from the unclamped
    // argument, so out-of-range inputs extrapolate from the edge segment).
    v_x = _mm_sub_ps(v_x, _mm_cvtepi32_ps(v_ix));

    // Four coefficients per segment -> element offset is index * 4.
    v_ix = _mm_slli_epi32(v_ix, 2);
    int CV_DECL_ALIGNED(16) ix[4];
    _mm_store_si128((__m128i *)ix, v_ix);

    // Gather: one row of four coefficients per lane.
    __m128 v_tab0 = _mm_loadu_ps(tab + ix[0]);
    __m128 v_tab1 = _mm_loadu_ps(tab + ix[1]);
    __m128 v_tab2 = _mm_loadu_ps(tab + ix[2]);
    __m128 v_tab3 = _mm_loadu_ps(tab + ix[3]);

    // 4x4 transpose so that v_tabK holds coefficient K of all four lanes.
    __m128 v_tmp0 = _mm_unpacklo_ps(v_tab0, v_tab1);
    __m128 v_tmp1 = _mm_unpacklo_ps(v_tab2, v_tab3);
    __m128 v_tmp2 = _mm_unpackhi_ps(v_tab0, v_tab1);
    __m128 v_tmp3 = _mm_unpackhi_ps(v_tab2, v_tab3);
    v_tab0 = _mm_shuffle_ps(v_tmp0, v_tmp1, 0x44);
    v_tab2 = _mm_shuffle_ps(v_tmp2, v_tmp3, 0x44);
    v_tab1 = _mm_shuffle_ps(v_tmp0, v_tmp1, 0xee);
    v_tab3 = _mm_shuffle_ps(v_tmp2, v_tmp3, 0xee);

    // Horner evaluation: ((c3*x + c2)*x + c1)*x + c0.
    __m128 v_l = _mm_mul_ps(v_x, v_tab3);
    v_l = _mm_add_ps(v_l, v_tab2);
    v_l = _mm_mul_ps(v_l, v_x);
    v_l = _mm_add_ps(v_l, v_tab1);
    v_l = _mm_mul_ps(v_l, v_x);
    v_x = _mm_add_ps(v_l, v_tab0);
}
#endif
template<typename _Tp> struct ColorChannel
{
@ -5766,24 +5799,146 @@ struct RGB2Luv_f
}
float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
un = 4*whitept[0]*d;
vn = 9*whitept[1]*d;
un = 4*whitept[0]*d*13;
vn = 9*whitept[1]*d*13;
#if CV_SSE2
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
CV_Assert(whitept[1] == 1.f);
}
#if CV_SSE2
void process(__m128& v_r0, __m128& v_r1, __m128& v_g0,
__m128& v_g1, __m128& v_b0, __m128& v_b1) const
{
__m128 v_x0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[0]));
__m128 v_x1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[0]));
__m128 v_y0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[3]));
__m128 v_y1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[3]));
__m128 v_z0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[6]));
__m128 v_z1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[6]));
v_x0 = _mm_add_ps(v_x0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[1])));
v_x1 = _mm_add_ps(v_x1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[1])));
v_y0 = _mm_add_ps(v_y0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[4])));
v_y1 = _mm_add_ps(v_y1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[4])));
v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[7])));
v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[7])));
v_x0 = _mm_add_ps(v_x0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[2])));
v_x1 = _mm_add_ps(v_x1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[2])));
v_y0 = _mm_add_ps(v_y0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[5])));
v_y1 = _mm_add_ps(v_y1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[5])));
v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[8])));
v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[8])));
__m128 v_l0 = _mm_mul_ps(v_y0, _mm_set1_ps(LabCbrtTabScale));
__m128 v_l1 = _mm_mul_ps(v_y1, _mm_set1_ps(LabCbrtTabScale));
splineInterpolate(v_l0, LabCbrtTab, LAB_CBRT_TAB_SIZE);
splineInterpolate(v_l1, LabCbrtTab, LAB_CBRT_TAB_SIZE);
v_l0 = _mm_mul_ps(v_l0, _mm_set1_ps(116.0f));
v_l1 = _mm_mul_ps(v_l1, _mm_set1_ps(116.0f));
v_r0 = _mm_sub_ps(v_l0, _mm_set1_ps(16.0f));
v_r1 = _mm_sub_ps(v_l1, _mm_set1_ps(16.0f));
v_z0 = _mm_mul_ps(v_z0, _mm_set1_ps(3.0f));
v_z1 = _mm_mul_ps(v_z1, _mm_set1_ps(3.0f));
v_z0 = _mm_add_ps(v_z0, v_x0);
v_z1 = _mm_add_ps(v_z1, v_x1);
v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_y0, _mm_set1_ps(15.0f)));
v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_y1, _mm_set1_ps(15.0f)));
v_z0 = _mm_max_ps(v_z0, _mm_set1_ps(FLT_EPSILON));
v_z1 = _mm_max_ps(v_z1, _mm_set1_ps(FLT_EPSILON));
__m128 v_d0 = _mm_div_ps(_mm_set1_ps(52.0f), v_z0);
__m128 v_d1 = _mm_div_ps(_mm_set1_ps(52.0f), v_z1);
v_x0 = _mm_mul_ps(v_x0, v_d0);
v_x1 = _mm_mul_ps(v_x1, v_d1);
v_x0 = _mm_sub_ps(v_x0, _mm_set1_ps(un));
v_x1 = _mm_sub_ps(v_x1, _mm_set1_ps(un));
v_g0 = _mm_mul_ps(v_x0, v_r0);
v_g1 = _mm_mul_ps(v_x1, v_r1);
v_y0 = _mm_mul_ps(v_y0, v_d0);
v_y1 = _mm_mul_ps(v_y1, v_d1);
v_y0 = _mm_mul_ps(v_y0, _mm_set1_ps(2.25f));
v_y1 = _mm_mul_ps(v_y1, _mm_set1_ps(2.25f));
v_y0 = _mm_sub_ps(v_y0, _mm_set1_ps(vn));
v_y1 = _mm_sub_ps(v_y1, _mm_set1_ps(vn));
v_b0 = _mm_mul_ps(v_y0, v_r0);
v_b1 = _mm_mul_ps(v_y1, v_r1);
}
#endif
void operator()(const float* src, float* dst, int n) const
{
int i, scn = srccn;
int i = 0, scn = srccn;
float gscale = GammaTabScale;
const float* gammaTab = srgb ? sRGBGammaTab : 0;
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
float _un = 13*un, _vn = 13*vn;
n *= 3;
for( i = 0; i < n; i += 3, src += scn )
#if CV_SSE2
if (haveSIMD)
{
for( ; i <= n - 24; i += 24, src += scn * 8 )
{
__m128 v_r0 = _mm_loadu_ps(src + 0);
__m128 v_r1 = _mm_loadu_ps(src + 4);
__m128 v_g0 = _mm_loadu_ps(src + 8);
__m128 v_g1 = _mm_loadu_ps(src + 12);
__m128 v_b0 = _mm_loadu_ps(src + 16);
__m128 v_b1 = _mm_loadu_ps(src + 20);
if (scn == 3)
{
_mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
}
else
{
__m128 v_a0 = _mm_loadu_ps(src + 24);
__m128 v_a1 = _mm_loadu_ps(src + 28);
_mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
}
if ( gammaTab )
{
__m128 v_gscale = _mm_set1_ps(gscale);
v_r0 = _mm_mul_ps(v_r0, v_gscale);
v_r1 = _mm_mul_ps(v_r1, v_gscale);
v_g0 = _mm_mul_ps(v_g0, v_gscale);
v_g1 = _mm_mul_ps(v_g1, v_gscale);
v_b0 = _mm_mul_ps(v_b0, v_gscale);
v_b1 = _mm_mul_ps(v_b1, v_gscale);
splineInterpolate(v_r0, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_r1, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_g0, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_g1, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_b0, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_b1, gammaTab, GAMMA_TAB_SIZE);
}
process(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
_mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
_mm_storeu_ps(dst + i + 0, v_r0);
_mm_storeu_ps(dst + i + 4, v_r1);
_mm_storeu_ps(dst + i + 8, v_g0);
_mm_storeu_ps(dst + i + 12, v_g1);
_mm_storeu_ps(dst + i + 16, v_b0);
_mm_storeu_ps(dst + i + 20, v_b1);
}
}
#endif
for( ; i < n; i += 3, src += scn )
{
float R = src[0], G = src[1], B = src[2];
if( gammaTab )
@ -5801,8 +5956,8 @@ struct RGB2Luv_f
L = 116.f*L - 16.f;
float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
float u = L*(X*d - _un);
float v = L*((9*0.25f)*Y*d - _vn);
float u = L*(X*d - un);
float v = L*((9*0.25f)*Y*d - vn);
dst[i] = L; dst[i+1] = u; dst[i+2] = v;
}
@ -5811,6 +5966,9 @@ struct RGB2Luv_f
int srccn;
float coeffs[9], un, vn;
bool srgb;
#if CV_SSE2
bool haveSIMD;
#endif
};