From 1095076d7fd5e7ff03aa6965c603a3ff36b115f0 Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Tue, 21 Jun 2016 11:36:52 +0900 Subject: [PATCH 1/2] imgproc: speed up threshold of 64F version using NEON and SSE * use NEON under aarch64 only * check 64F version correctly --- modules/imgproc/src/thresh.cpp | 188 +++++++++++++++++++++++++++ modules/imgproc/test/test_thresh.cpp | 18 +-- 2 files changed, 197 insertions(+), 9 deletions(-) diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index 13f0fa284b..aa104d0cf3 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -915,6 +915,10 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) size_t src_step = _src.step / sizeof(src[0]); size_t dst_step = _dst.step / sizeof(dst[0]); +#if CV_SSE2 + volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); +#endif + if (_src.isContinuous() && _dst.isContinuous()) { roi.width *= roi.height; @@ -927,6 +931,45 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; +#if CV_SSE2 + if( useSIMD ) + { + __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval); + for( ; j <= roi.width - 8; j += 8 ) + { + __m128d v0, v1, v2, v3; + v0 = _mm_loadu_pd( src + j ); + v1 = _mm_loadu_pd( src + j + 2 ); + v2 = _mm_loadu_pd( src + j + 4 ); + v3 = _mm_loadu_pd( src + j + 6 ); + v0 = _mm_cmpgt_pd( v0, thresh2 ); + v1 = _mm_cmpgt_pd( v1, thresh2 ); + v2 = _mm_cmpgt_pd( v2, thresh2 ); + v3 = _mm_cmpgt_pd( v3, thresh2 ); + v0 = _mm_and_pd( v0, maxval2 ); + v1 = _mm_and_pd( v1, maxval2 ); + v2 = _mm_and_pd( v2, maxval2 ); + v3 = _mm_and_pd( v3, maxval2 ); + _mm_storeu_pd( dst + j, v0 ); + _mm_storeu_pd( dst + j + 2, v1 ); + _mm_storeu_pd( dst + j + 4, v2 ); + _mm_storeu_pd( dst + j + 6, v3 ); + } + } +#elif CV_NEON && defined(__aarch64__) + float64x2_t v_thresh = vdupq_n_f64(thresh); + uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval)); + + for( ; j <= roi.width - 4; j += 4 ) + { + float64x2_t v_src0 = vld1q_f64(src + j); + float64x2_t v_src1 = vld1q_f64(src + j + 2); + uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh), v_maxval); + uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh), v_maxval); + vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0)); + vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1)); + } +#endif for (; j < roi.width; j++) dst[j] = src[j] > thresh ? maxval : 0; @@ -938,6 +981,45 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) { j = 0; +#if CV_SSE2 + if( useSIMD ) + { + __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval); + for( ; j <= roi.width - 8; j += 8 ) + { + __m128d v0, v1, v2, v3; + v0 = _mm_loadu_pd( src + j ); + v1 = _mm_loadu_pd( src + j + 2 ); + v2 = _mm_loadu_pd( src + j + 4 ); + v3 = _mm_loadu_pd( src + j + 6 ); + v0 = _mm_cmple_pd( v0, thresh2 ); + v1 = _mm_cmple_pd( v1, thresh2 ); + v2 = _mm_cmple_pd( v2, thresh2 ); + v3 = _mm_cmple_pd( v3, thresh2 ); + v0 = _mm_and_pd( v0, maxval2 ); + v1 = _mm_and_pd( v1, maxval2 ); + v2 = _mm_and_pd( v2, maxval2 ); + v3 = _mm_and_pd( v3, maxval2 ); + _mm_storeu_pd( dst + j, v0 ); + _mm_storeu_pd( dst + j + 2, v1 ); + _mm_storeu_pd( dst + j + 4, v2 ); + _mm_storeu_pd( dst + j + 6, v3 ); + } + } +#elif CV_NEON && defined(__aarch64__) + float64x2_t v_thresh = vdupq_n_f64(thresh); + uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval)); + + for( ; j <= roi.width - 4; j += 4 ) + { + float64x2_t v_src0 = vld1q_f64(src + j); + float64x2_t v_src1 = vld1q_f64(src + j + 2); + uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh), v_maxval); + uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh), v_maxval); + vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0)); + vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1)); + } +#endif for (; j < roi.width; j++) dst[j] = src[j] <= thresh ? maxval : 0; } @@ -948,6 +1030,40 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) { j = 0; +#if CV_SSE2 + if( useSIMD ) + { + __m128d thresh2 = _mm_set1_pd(thresh); + for( ; j <= roi.width - 8; j += 8 ) + { + __m128d v0, v1, v2, v3; + v0 = _mm_loadu_pd( src + j ); + v1 = _mm_loadu_pd( src + j + 2 ); + v2 = _mm_loadu_pd( src + j + 4 ); + v3 = _mm_loadu_pd( src + j + 6 ); + v0 = _mm_min_pd( v0, thresh2 ); + v1 = _mm_min_pd( v1, thresh2 ); + v2 = _mm_min_pd( v2, thresh2 ); + v3 = _mm_min_pd( v3, thresh2 ); + _mm_storeu_pd( dst + j, v0 ); + _mm_storeu_pd( dst + j + 2, v1 ); + _mm_storeu_pd( dst + j + 4, v2 ); + _mm_storeu_pd( dst + j + 6, v3 ); + } + } +#elif CV_NEON && defined(__aarch64__) + float64x2_t v_thresh = vdupq_n_f64(thresh); + + for( ; j <= roi.width - 4; j += 4 ) + { + float64x2_t v_src0 = vld1q_f64(src + j); + float64x2_t v_src1 = vld1q_f64(src + j + 2); + float64x2_t v_dst0 = vminq_f64(v_src0, v_thresh); + float64x2_t v_dst1 = vminq_f64(v_src1, v_thresh); + vst1q_f64(dst + j, v_dst0); + vst1q_f64(dst + j + 2, v_dst1); + } +#endif for (; j < roi.width; j++) dst[j] = std::min(src[j], thresh); } @@ -958,6 +1074,42 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) { j = 0; +#if CV_SSE2 + if( useSIMD ) + { + __m128d thresh2 = _mm_set1_pd(thresh); + for( ; j <= roi.width - 8; j += 8 ) + { + __m128d v0, v1, v2, v3; + v0 = _mm_loadu_pd( src + j ); + v1 = _mm_loadu_pd( src + j + 2 ); + v2 = _mm_loadu_pd( src + j + 4 ); + v3 = _mm_loadu_pd( src + j + 6 ); + v0 = _mm_and_pd( v0, _mm_cmpgt_pd(v0, thresh2)); + v1 = _mm_and_pd( v1, _mm_cmpgt_pd(v1, thresh2)); + v2 = _mm_and_pd( v2, _mm_cmpgt_pd(v2, thresh2)); + v3 = _mm_and_pd( v3, _mm_cmpgt_pd(v3, thresh2)); + _mm_storeu_pd( dst + j, v0 ); + _mm_storeu_pd( dst + j + 2, v1 ); + _mm_storeu_pd( dst + j + 4, v2 ); + _mm_storeu_pd( dst + j + 6, v3 ); + } + } +#elif CV_NEON && defined(__aarch64__) + float64x2_t v_thresh = vdupq_n_f64(thresh); + + for( ; j <= roi.width - 4; j += 4 ) + { + float64x2_t v_src0 = vld1q_f64(src + j); + float64x2_t v_src1 = vld1q_f64(src + j + 2); + uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh), + vreinterpretq_u64_f64(v_src0)); + uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh), + vreinterpretq_u64_f64(v_src1)); + vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0)); + vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1)); + } +#endif for (; j < roi.width; j++) { double v = src[j]; @@ -971,6 +1123,42 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) { j = 0; +#if CV_SSE2 + if( useSIMD ) + { + __m128d thresh2 = _mm_set1_pd(thresh); + for( ; j <= roi.width - 8; j += 8 ) + { + __m128d v0, v1, v2, v3; + v0 = _mm_loadu_pd( src + j ); + v1 = _mm_loadu_pd( src + j + 2 ); + v2 = _mm_loadu_pd( src + j + 4 ); + v3 = _mm_loadu_pd( src + j + 6 ); + v0 = _mm_and_pd( v0, _mm_cmple_pd(v0, thresh2)); + v1 = _mm_and_pd( v1, _mm_cmple_pd(v1, thresh2)); + v2 = _mm_and_pd( v2, _mm_cmple_pd(v2, thresh2)); + v3 = _mm_and_pd( v3, _mm_cmple_pd(v3, thresh2)); + _mm_storeu_pd( dst + j, v0 ); + _mm_storeu_pd( dst + j + 2, v1 ); + _mm_storeu_pd( dst + j + 4, v2 ); + _mm_storeu_pd( dst + j + 6, v3 ); + } + } +#elif CV_NEON && defined(__aarch64__) + float64x2_t v_thresh = vdupq_n_f64(thresh); + + for( ; j <= roi.width - 4; j += 4 ) + { + float64x2_t v_src0 = vld1q_f64(src + j); + float64x2_t v_src1 = vld1q_f64(src + j + 2); + uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh), + vreinterpretq_u64_f64(v_src0)); + uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh), + vreinterpretq_u64_f64(v_src1)); + vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0)); + vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1)); + } +#endif for (; j < roi.width; j++) { double v = src[j]; diff --git a/modules/imgproc/test/test_thresh.cpp b/modules/imgproc/test/test_thresh.cpp index b7db66e989..1ba930a752 100644 --- a/modules/imgproc/test/test_thresh.cpp +++ b/modules/imgproc/test/test_thresh.cpp @@ -75,17 +75,17 @@ void CV_ThreshTest::get_test_array_types_and_sizes( int test_case_idx, vector >& sizes, vector >& types ) { RNG& rng = ts->get_rng(); - int depth = cvtest::randInt(rng) % 3, cn = cvtest::randInt(rng) % 4 + 1; + int depth = cvtest::randInt(rng) % 4, cn = cvtest::randInt(rng) % 4 + 1; cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); - depth = depth == 0 ? CV_8U : depth == 1 ? CV_16S : CV_32F; + depth = depth == 0 ? CV_8U : depth == 1 ? CV_16S : depth == 2 ? CV_32F : CV_64F; types[INPUT][0] = types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth,cn); thresh_type = cvtest::randInt(rng) % 5; if( depth == CV_8U ) { - thresh_val = (float)(cvtest::randReal(rng)*350. - 50.); - max_val = (float)(cvtest::randReal(rng)*350. - 50.); + thresh_val = (cvtest::randReal(rng)*350. - 50.); + max_val = (cvtest::randReal(rng)*350. - 50.); if( cvtest::randInt(rng)%4 == 0 ) max_val = 255.f; } @@ -93,15 +93,15 @@ void CV_ThreshTest::get_test_array_types_and_sizes( int test_case_idx, { float min_val = SHRT_MIN-100.f; max_val = SHRT_MAX+100.f; - thresh_val = (float)(cvtest::randReal(rng)*(max_val - min_val) + min_val); - max_val = (float)(cvtest::randReal(rng)*(max_val - min_val) + min_val); + thresh_val = (cvtest::randReal(rng)*(max_val - min_val) + min_val); + max_val = (cvtest::randReal(rng)*(max_val - min_val) + min_val); if( cvtest::randInt(rng)%4 == 0 ) - max_val = (float)SHRT_MAX; + max_val = (double)SHRT_MAX; } else { - thresh_val = (float)(cvtest::randReal(rng)*1000. - 500.); - max_val = (float)(cvtest::randReal(rng)*1000. - 500.); + thresh_val = (cvtest::randReal(rng)*1000. - 500.); + max_val = (cvtest::randReal(rng)*1000. - 500.); } } From 9fca953e62b91ce3d9c64722d56149fe13c07f4b Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Wed, 13 Jul 2016 07:10:20 +0900 Subject: [PATCH 2/2] check the CPU flag correctly --- modules/imgproc/src/thresh.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index aa104d0cf3..a63c59ef34 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -397,7 +397,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) size_t dst_step = _dst.step/sizeof(dst[0]); #if CV_SSE2 - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); + volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif if( _src.isContinuous() && _dst.isContinuous() ) @@ -665,7 +665,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) size_t src_step = _src.step/sizeof(src[0]); size_t dst_step = _dst.step/sizeof(dst[0]); -#if CV_SSE2 +#if CV_SSE volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); #endif @@ -720,7 +720,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; -#if CV_SSE2 +#if CV_SSE if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval); @@ -758,7 +758,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; -#if CV_SSE2 +#if CV_SSE if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval); @@ -796,7 +796,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; -#if CV_SSE2 +#if CV_SSE if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh); @@ -827,7 +827,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; -#if CV_SSE2 +#if CV_SSE if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh); @@ -866,7 +866,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; -#if CV_SSE2 +#if CV_SSE if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh); @@ -916,7 +916,7 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) size_t dst_step = _dst.step / sizeof(dst[0]); #if CV_SSE2 - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); + volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif if (_src.isContinuous() && _dst.isContinuous())