improve RGB2Gray<ushort>()

This commit is contained in:
k-shinotsuka 2016-08-03 23:42:24 +09:00
parent b34272f8a2
commit 020b47c41a

View File

@ -1492,36 +1492,47 @@ struct RGB2Gray<ushort>
if( blueIdx == 0 )
std::swap(coeffs[0], coeffs[2]);
v_cb = _mm_set1_epi16((short)coeffs[0]);
v_cg = _mm_set1_epi16((short)coeffs[1]);
v_cr = _mm_set1_epi16((short)coeffs[2]);
v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
v_zero = _mm_setzero_si128();
haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
}
// 16s x 8
void process(__m128i v_b, __m128i v_g, __m128i v_r,
void process(__m128i* v_rgb, __m128i* v_coeffs,
__m128i & v_gray) const
{
__m128i v_mullo_r = _mm_mullo_epi16(v_r, v_cr);
__m128i v_mullo_g = _mm_mullo_epi16(v_g, v_cg);
__m128i v_mullo_b = _mm_mullo_epi16(v_b, v_cb);
__m128i v_mulhi_r = _mm_mulhi_epu16(v_r, v_cr);
__m128i v_mulhi_g = _mm_mulhi_epu16(v_g, v_cg);
__m128i v_mulhi_b = _mm_mulhi_epu16(v_b, v_cb);
__m128i v_rgb_hi[4];
v_rgb_hi[0] = _mm_cmplt_epi16(v_rgb[0], v_zero);
v_rgb_hi[1] = _mm_cmplt_epi16(v_rgb[1], v_zero);
v_rgb_hi[2] = _mm_cmplt_epi16(v_rgb[2], v_zero);
v_rgb_hi[3] = _mm_cmplt_epi16(v_rgb[3], v_zero);
__m128i v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_r, v_mulhi_r),
_mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), v_gray0);
v_gray0 = _mm_srli_epi32(_mm_add_epi32(v_gray0, v_delta), yuv_shift);
v_rgb_hi[0] = _mm_and_si128(v_rgb_hi[0], v_coeffs[1]);
v_rgb_hi[1] = _mm_and_si128(v_rgb_hi[1], v_coeffs[1]);
v_rgb_hi[2] = _mm_and_si128(v_rgb_hi[2], v_coeffs[1]);
v_rgb_hi[3] = _mm_and_si128(v_rgb_hi[3], v_coeffs[1]);
__m128i v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r),
_mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), v_gray1);
v_gray1 = _mm_srli_epi32(_mm_add_epi32(v_gray1, v_delta), yuv_shift);
v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[1]);
v_rgb_hi[2] = _mm_hadd_epi16(v_rgb_hi[2], v_rgb_hi[3]);
v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[2]);
v_gray = _mm_packus_epi32(v_gray0, v_gray1);
v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_coeffs[0]);
v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_coeffs[0]);
v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_coeffs[0]);
v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_coeffs[0]);
v_rgb[0] = _mm_hadd_epi32(v_rgb[0], v_rgb[1]);
v_rgb[2] = _mm_hadd_epi32(v_rgb[2], v_rgb[3]);
v_rgb[0] = _mm_add_epi32(v_rgb[0], v_delta);
v_rgb[2] = _mm_add_epi32(v_rgb[2], v_delta);
v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift);
v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift);
v_gray = _mm_packs_epi32(v_rgb[0], v_rgb[2]);
v_gray = _mm_add_epi16(v_gray, v_rgb_hi[0]);
}
void operator()(const ushort* src, ushort* dst, int n) const
@ -1530,54 +1541,49 @@ struct RGB2Gray<ushort>
if (scn == 3 && haveSIMD)
{
for ( ; i <= n - 16; i += 16, src += scn * 16)
__m128i v_coeffs[2];
v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0);
v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2);
for ( ; i <= n - 8; i += 8, src += scn * 8)
{
__m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
__m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
__m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
__m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
__m128i v_src[2];
v_src[0] = _mm_loadu_si128((__m128i const *)(src));
v_src[1] = _mm_loadu_si128((__m128i const *)(src + 8));
v_src[2] = _mm_loadu_si128((__m128i const *)(src + 16));
_mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
__m128i v_rgb[4];
v_rgb[0] = _mm_slli_si128(v_src[0], 2);
v_rgb[1] = _mm_alignr_epi8(v_src[1], v_src[0], 10);
v_rgb[2] = _mm_alignr_epi8(v_src[2], v_src[1], 6);
v_rgb[3] = _mm_srli_si128(v_src[2], 2);
__m128i v_gray0;
process(v_r0, v_g0, v_b0,
v_gray0);
__m128i v_gray;
process(v_rgb, v_coeffs,
v_gray);
__m128i v_gray1;
process(v_r1, v_g1, v_b1,
v_gray1);
_mm_storeu_si128((__m128i *)(dst + i), v_gray0);
_mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
_mm_storeu_si128((__m128i *)(dst + i), v_gray);
}
}
else if (scn == 4 && haveSIMD)
{
for ( ; i <= n - 16; i += 16, src += scn * 16)
__m128i v_coeffs[2];
v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0]);
v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2);
for ( ; i <= n - 8; i += 8, src += scn * 8)
{
__m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
__m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
__m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
__m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
__m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
__m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
__m128i v_rgb[4];
v_rgb[0] = _mm_loadu_si128((__m128i const *)(src));
v_rgb[1] = _mm_loadu_si128((__m128i const *)(src + 8));
v_rgb[2] = _mm_loadu_si128((__m128i const *)(src + 16));
v_rgb[3] = _mm_loadu_si128((__m128i const *)(src + 24));
_mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
__m128i v_gray;
process(v_rgb, v_coeffs,
v_gray);
__m128i v_gray0;
process(v_r0, v_g0, v_b0,
v_gray0);
__m128i v_gray1;
process(v_r1, v_g1, v_b1,
v_gray1);
_mm_storeu_si128((__m128i *)(dst + i), v_gray0);
_mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
_mm_storeu_si128((__m128i *)(dst + i), v_gray);
}
}
@ -1586,8 +1592,8 @@ struct RGB2Gray<ushort>
}
int srccn, coeffs[3];
__m128i v_cb, v_cg, v_cr;
__m128i v_delta;
__m128i v_zero;
bool haveSIMD;
};