From a5f68e98f4ec18036ef800482127f0f72b0e26a7 Mon Sep 17 00:00:00 2001 From: k-shinotsuka Date: Mon, 30 Apr 2018 16:38:49 +0900 Subject: [PATCH] add universal intrinsics for HLS2RGB_f --- modules/imgproc/src/color_hsv.cpp | 198 +++++++++++------------------- 1 file changed, 71 insertions(+), 127 deletions(-) diff --git a/modules/imgproc/src/color_hsv.cpp b/modules/imgproc/src/color_hsv.cpp index 45f5bdb21d..81530289dd 100644 --- a/modules/imgproc/src/color_hsv.cpp +++ b/modules/imgproc/src/color_hsv.cpp @@ -872,167 +872,111 @@ struct HLS2RGB_f HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange) : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { - #if CV_SSE2 - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + #if CV_SIMD128 + hasSIMD = hasSIMD128(); #endif } - #if CV_SSE2 - void process(__m128& v_h0, __m128& v_h1, __m128& v_l0, - __m128& v_l1, __m128& v_s0, __m128& v_s1) const + #if CV_SIMD128 + inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const { - __m128 v_lel0 = _mm_cmple_ps(v_l0, _mm_set1_ps(0.5f)); - __m128 v_lel1 = _mm_cmple_ps(v_l1, _mm_set1_ps(0.5f)); - __m128 v_p20 = _mm_andnot_ps(v_lel0, _mm_sub_ps(_mm_add_ps(v_l0, v_s0), _mm_mul_ps(v_l0, v_s0))); - __m128 v_p21 = _mm_andnot_ps(v_lel1, _mm_sub_ps(_mm_add_ps(v_l1, v_s1), _mm_mul_ps(v_l1, v_s1))); - v_p20 = _mm_or_ps(v_p20, _mm_and_ps(v_lel0, _mm_mul_ps(v_l0, _mm_add_ps(_mm_set1_ps(1.0f), v_s0)))); - v_p21 = _mm_or_ps(v_p21, _mm_and_ps(v_lel1, _mm_mul_ps(v_l1, _mm_add_ps(_mm_set1_ps(1.0f), v_s1)))); + v_float32x4 v_one = v_setall_f32(1.0f); - __m128 v_p10 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l0), v_p20); - __m128 v_p11 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l1), v_p21); + v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f); + v_float32x4 v_ls = v_l * v_s; + v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls); - v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale)); - v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale)); + v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale); + v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw)); + v_float32x4 v_hs = v_hs_raw - v_pre_hs; + v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f))); + v_float32x4 v_elem1 = v_hs + v_hs; - __m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0)); - __m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1)); + v_float32x4 v_tab0 = v_l + v_elem0; + v_float32x4 v_tab1 = v_l - v_elem0; + v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1; + v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1; - v_h0 = _mm_sub_ps(v_h0, v_pre_sector0); - v_h1 = _mm_sub_ps(v_h1, v_pre_sector1); + v_float32x4 v_two = v_setall_f32(2.0f); + v_float32x4 v_four = v_setall_f32(4.0f); - __m128 v_p2_p10 = _mm_sub_ps(v_p20, v_p10); - __m128 v_p2_p11 = _mm_sub_ps(v_p21, v_p11); - __m128 v_tab20 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0))); - __m128 v_tab21 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1))); - __m128 v_tab30 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, v_h0)); - __m128 v_tab31 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, v_h1)); + v_h = v_select(v_sector < v_two , v_tab1, + v_select(v_sector <= v_two , v_tab3, + v_select(v_sector <= v_four, v_tab0, v_tab2))); - __m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f)); - __m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f)); - v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0)); - v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1)); - v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f)); - v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f)); - v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0); - v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1); + v_l = v_select(v_sector < v_one , v_tab3, + v_select(v_sector <= v_two , v_tab0, + v_select(v_sector < v_four, v_tab2, v_tab1))); - v_h0 = _mm_and_ps(v_p10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f))); - v_h1 = _mm_and_ps(v_p11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f))); - v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f)))); - v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f)))); - v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f)))); - v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f)))); - v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f)))); - v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f)))); - v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f)))); - v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f)))); - v_l0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f))); - v_l1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f))); - v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f)))); - v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f)))); - v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f)))); - v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f)))); - v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f)))); - v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f)))); - v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f)))); - v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f)))); - v_s0 = _mm_and_ps(v_p20, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f))); - v_s1 = _mm_and_ps(v_p21, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f)))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f)))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f)))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f)))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f)))); + v_s = v_select(v_sector < v_one , v_tab0, + v_select(v_sector < v_two , v_tab2, + v_select(v_sector < v_four, v_tab1, + v_select(v_sector <= v_four, v_tab3, v_tab0)))); } #endif void operator()(const float* src, float* dst, int n) const { int i = 0, bidx = blueIdx, dcn = dstcn; - float _hscale = hscale; float alpha = ColorChannel::max(); n *= 3; - #if CV_SSE2 - if (haveSIMD) + #if CV_SIMD128 + if (hasSIMD) { - for( ; i <= n - 24; i += 24, dst += dcn * 8 ) + if (dcn == 3) { - __m128 v_h0 = _mm_loadu_ps(src + i + 0); - __m128 v_h1 = _mm_loadu_ps(src + i + 4); - __m128 v_l0 = _mm_loadu_ps(src + i + 8); - __m128 v_l1 = _mm_loadu_ps(src + i + 12); - __m128 v_s0 = _mm_loadu_ps(src + i + 16); - __m128 v_s1 = _mm_loadu_ps(src + i + 20); - - _mm_deinterleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); - - process(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); - - if (dcn == 3) + if (bidx) { - if (bidx) + for (; i <= n - 12; i += 12, dst += dcn * 4) { - _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1); - - _mm_storeu_ps(dst + 0, v_s0); - _mm_storeu_ps(dst + 4, v_s1); - _mm_storeu_ps(dst + 8, v_l0); - _mm_storeu_ps(dst + 12, v_l1); - _mm_storeu_ps(dst + 16, v_h0); - _mm_storeu_ps(dst + 20, v_h1); + v_float32x4 v_h; + v_float32x4 v_l; + v_float32x4 v_s; + v_load_deinterleave(src + i, v_h, v_l, v_s); + process(v_h, v_l, v_s); + v_store_interleave(dst, v_s, v_l, v_h); } - else + } else { + for (; i <= n - 12; i += 12, dst += dcn * 4) { - _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); - - _mm_storeu_ps(dst + 0, v_h0); - _mm_storeu_ps(dst + 4, v_h1); - _mm_storeu_ps(dst + 8, v_l0); - _mm_storeu_ps(dst + 12, v_l1); - _mm_storeu_ps(dst + 16, v_s0); - _mm_storeu_ps(dst + 20, v_s1); + v_float32x4 v_h; + v_float32x4 v_l; + v_float32x4 v_s; + v_load_deinterleave(src + i, v_h, v_l, v_s); + process(v_h, v_l, v_s); + v_store_interleave(dst, v_h, v_l, v_s); } } - else + } else { // dcn == 4 + if (bidx) { - __m128 v_a0 = _mm_set1_ps(alpha); - __m128 v_a1 = _mm_set1_ps(alpha); - if (bidx) + for (; i <= n - 12; i += 12, dst += dcn * 4) { - _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1, v_a0, v_a1); - - _mm_storeu_ps(dst + 0, v_s0); - _mm_storeu_ps(dst + 4, v_s1); - _mm_storeu_ps(dst + 8, v_l0); - _mm_storeu_ps(dst + 12, v_l1); - _mm_storeu_ps(dst + 16, v_h0); - _mm_storeu_ps(dst + 20, v_h1); - _mm_storeu_ps(dst + 24, v_a0); - _mm_storeu_ps(dst + 28, v_a1); + v_float32x4 v_h; + v_float32x4 v_l; + v_float32x4 v_s; + v_load_deinterleave(src + i, v_h, v_l, v_s); + process(v_h, v_l, v_s); + v_float32x4 v_a = v_setall_f32(alpha); + v_store_interleave(dst, v_s, v_l, v_h, v_a); } - else + } else { + for (; i <= n - 12; i += 12, dst += dcn * 4) { - _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1, v_a0, v_a1); - - _mm_storeu_ps(dst + 0, v_h0); - _mm_storeu_ps(dst + 4, v_h1); - _mm_storeu_ps(dst + 8, v_l0); - _mm_storeu_ps(dst + 12, v_l1); - _mm_storeu_ps(dst + 16, v_s0); - _mm_storeu_ps(dst + 20, v_s1); - _mm_storeu_ps(dst + 24, v_a0); - _mm_storeu_ps(dst + 28, v_a1); + v_float32x4 v_h; + v_float32x4 v_l; + v_float32x4 v_s; + v_load_deinterleave(src + i, v_h, v_l, v_s); + process(v_h, v_l, v_s); + v_float32x4 v_a = v_setall_f32(alpha); + v_store_interleave(dst, v_h, v_l, v_s, v_a); } } } } #endif + for( ; i < n; i += 3, dst += dcn ) { float h = src[i], l = src[i+1], s = src[i+2]; @@ -1050,7 +994,7 @@ struct HLS2RGB_f float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; float p1 = 2*l - p2; - h *= _hscale; + h *= hscale; if( h < 0 ) do h += 6; while( h < 0 ); else if( h >= 6 ) @@ -1080,8 +1024,8 @@ struct HLS2RGB_f int dstcn, blueIdx; float hscale; - #if CV_SSE2 - bool haveSIMD; + #if CV_SIMD128 + bool hasSIMD; #endif };