From 8d22ac200f488eb76d86cdf6ad12581df2980095 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin
Date: Wed, 11 Dec 2019 20:08:10 +0000
Subject: [PATCH] core: workaround flipHoriz() alignment issues

---
 modules/core/include/opencv2/core/utility.hpp | 37 +++++++++
 modules/core/src/copy.cpp                     | 77 ++++++++++++++++---
 2 files changed, 102 insertions(+), 12 deletions(-)

diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index e7f169b01a..063747e730 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -514,6 +514,43 @@ static inline size_t roundUp(size_t a, unsigned int b)
     return a + b - 1 - (a + b - 1) % b;
 }
 
+/** @brief Alignment check of passed values
+
+Usage: `isAligned<sizeof(int)>(...)`
+
+@note Alignment(N) must be a power of 2 (2**k, 2^k)
+*/
+template<int N, typename T> static inline
+bool isAligned(const T& data)
+{
+    CV_StaticAssert((N & (N - 1)) == 0, "");  // power of 2
+    return (((size_t)data) & (N - 1)) == 0;
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1)
+{
+    return isAligned<N>((size_t)p1);
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2)
+{
+    return isAligned<N>(((size_t)p1)|((size_t)p2));
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2, const void* p3)
+{
+    return isAligned<N>(((size_t)p1)|((size_t)p2)|((size_t)p3));
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2, const void* p3, const void* p4)
+{
+    return isAligned<N>(((size_t)p1)|((size_t)p2)|((size_t)p3)|((size_t)p4));
+}
+
 /** @brief Enables or disables the optimized code.
 
 The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2,
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 3f68a2555a..3fa498286a 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -563,6 +563,12 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
     return *this;
 }
 
+#if CV_NEON && !defined(__aarch64__)
+#define CV_CHECK_ALIGNMENT 1
+#else
+#define CV_CHECK_ALIGNMENT 0
+#endif
+
 #if CV_SIMD128
 template<typename T> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
@@ -572,6 +578,10 @@ template<typename T> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s
     int width_1 = width & -v_uint8x16::nlanes;
     int i, j;
 
+#if CV_CHECK_ALIGNMENT
+    CV_Assert(isAligned<sizeof(T)>(src, dst));
+#endif
+
     for( ; size.height--; src += sstep, dst += dstep )
     {
         for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
@@ -585,7 +595,7 @@ template<typename T> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s
             v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
             v_store((T*)(dst + i), t1);
         }
-        if (((size_t)src|(size_t)dst) % sizeof(T) == 0)
+        if (isAligned<sizeof(T)>(src, dst))
         {
             for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
             {
@@ -620,6 +630,11 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const
     int end = (int)(size.width*esz);
     int width = (end + 1)/2;
 
+#if CV_CHECK_ALIGNMENT
+    CV_Assert(isAligned<sizeof(T1)>(src, dst));
+    CV_Assert(isAligned<sizeof(T2)>(src, dst));
+#endif
+
     for( ; size.height--; src += sstep, dst += dstep )
     {
         for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
@@ -644,6 +659,9 @@ static void
 flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
 #if CV_SIMD
+#if CV_CHECK_ALIGNMENT
+    size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
+#endif
     if (esz == 2 * v_uint8x16::nlanes)
     {
         int end = (int)(size.width*esz);
@@ -693,15 +711,27 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
             }
         }
     }
-    else if (esz == 8)
+    else if (esz == 8
+#if CV_CHECK_ALIGNMENT
+        && isAligned<sizeof(uint64_t)>(alignmentMark)
+#endif
+    )
     {
         flipHoriz_single<uint64_t>(src, sstep, dst, dstep, size, esz);
     }
-    else if (esz == 4)
+    else if (esz == 4
+#if CV_CHECK_ALIGNMENT
+        && isAligned<sizeof(unsigned)>(alignmentMark)
+#endif
+    )
     {
         flipHoriz_single<unsigned>(src, sstep, dst, dstep, size, esz);
     }
-    else if (esz == 2)
+    else if (esz == 2
+#if CV_CHECK_ALIGNMENT
+        && isAligned<sizeof(ushort)>(alignmentMark)
+#endif
+    )
     {
         flipHoriz_single<ushort>(src, sstep, dst, dstep, size, esz);
     }
@@ -709,7 +739,11 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
     {
         flipHoriz_single<uchar>(src, sstep, dst, dstep, size, esz);
     }
-    else if (esz == 24)
+    else if (esz == 24
+#if CV_CHECK_ALIGNMENT
+        && isAligned<sizeof(uint64_t)>(alignmentMark)
+#endif
+    )
     {
         int end = (int)(size.width*esz);
         int width = (end + 1)/2;
@@ -732,6 +766,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
             }
         }
     }
+#if !CV_CHECK_ALIGNMENT
     else if (esz == 12)
    {
         flipHoriz_double<uint64_t, unsigned>(src, sstep, dst, dstep, size, esz);
@@ -744,8 +779,9 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
     {
         flipHoriz_double<ushort, uchar>(src, sstep, dst, dstep, size, esz);
     }
-    else
 #endif
+    else
+#endif // CV_SIMD
     {
         int i, j, limit = (int)(((size.width + 1)/2)*esz);
         AutoBuffer<int> _tab(size.width*esz);
@@ -779,16 +815,33 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
     {
         int i = 0;
 #if CV_SIMD
-        for( ; i <= size.width - (v_int32::nlanes * 4); i += v_int32::nlanes * 4 )
+#if CV_CHECK_ALIGNMENT
+        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
+#endif
         {
-            v_int32 t0 = vx_load((int*)(src0 + i));
-            v_int32 t1 = vx_load((int*)(src1 + i));
-            vx_store((int*)(dst0 + i), t1);
-            vx_store((int*)(dst1 + i), t0);
+            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            {
+                v_int32 t0 = vx_load((int*)(src0 + i));
+                v_int32 t1 = vx_load((int*)(src1 + i));
+                vx_store((int*)(dst0 + i), t1);
+                vx_store((int*)(dst1 + i), t0);
+            }
+        }
+#if CV_CHECK_ALIGNMENT
+        else
+        {
+            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            {
+                v_uint8 t0 = vx_load(src0 + i);
+                v_uint8 t1 = vx_load(src1 + i);
+                vx_store(dst0 + i, t1);
+                vx_store(dst1 + i, t0);
+            }
         }
+#endif
 #endif
 
-        if( ((size_t)src0|(size_t)dst0|(size_t)src1|(size_t)dst1) % sizeof(int) == 0 )
+        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
         {
             for( ; i <= size.width - 16; i += 16 )
             {
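
Note (illustrative, not part of the patch): the isAligned<N>() helpers added to utility.hpp assume N is a power of two, which is what lets the multi-pointer overloads OR the addresses together — for power-of-two N, (a|b) has its low bits clear exactly when both a and b do, so a single bitwise test covers every pointer at once. A minimal standalone sketch of that idea, restating the helper locally instead of including the OpenCV header:

    #include <cstddef>
    #include <cstdio>

    // Local restatement of the patch's isAligned<N>() helper (illustrative only).
    template<int N, typename T> static inline bool isAligned(const T& data)
    {
        static_assert((N & (N - 1)) == 0, "N must be a power of 2");
        return (((std::size_t)data) & (N - 1)) == 0;
    }
    // Two-pointer overload: OR the addresses, then test once.
    template<int N> static inline bool isAligned(const void* p1, const void* p2)
    {
        return isAligned<N>(((std::size_t)p1) | ((std::size_t)p2));
    }

    int main()
    {
        alignas(16) unsigned char buf[32] = {};
        std::printf("%d\n", (int)isAligned<8>(buf, buf + 16)); // 1: both offsets are multiples of 8
        std::printf("%d\n", (int)isAligned<8>(buf, buf + 3));  // 0: buf + 3 breaks 8-byte alignment
        return 0;
    }

The same trick is what flipHoriz() does with alignmentMark: it ORs both base pointers and both row steps, so one isAligned<sizeof(...)> test decides whether the element-typed path is safe for the whole image.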
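
Note (illustrative, not part of the patch): CV_CHECK_ALIGNMENT is enabled only for 32-bit ARM NEON builds (CV_NEON && !defined(__aarch64__)), where dereferencing a T* that is not sizeof(T)-aligned may not be safe; the dispatcher therefore takes the element-typed fast path only when the alignment mark passes and otherwise keeps the byte-wise path. A hedged standalone sketch of that dispatch shape — reverseRow, reverseRowTyped and reverseRowBytes are hypothetical names, not the OpenCV implementation:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Byte-wise fallback: safe for any alignment and any element size.
    static void reverseRowBytes(const unsigned char* src, unsigned char* dst, int elems, std::size_t esz)
    {
        for (int i = 0; i < elems; i++)
            std::memcpy(dst + (std::size_t)i * esz, src + (std::size_t)(elems - 1 - i) * esz, esz);
    }

    // Element-typed fast path: one load/store per element; on strict-alignment targets it
    // requires sizeof(T)-aligned pointers. Assumes src and dst do not overlap.
    template<typename T> static void reverseRowTyped(const unsigned char* src, unsigned char* dst, int elems)
    {
        const T* s = (const T*)src;
        T* d = (T*)dst;
        for (int i = 0; i < elems; i++)
            d[i] = s[elems - 1 - i];
    }

    // Dispatcher mirroring the patch: OR the addresses into one mark (the real code also ORs
    // the row steps) and use the typed path only when the mark is sufficiently aligned.
    static void reverseRow(const unsigned char* src, unsigned char* dst, int elems, std::size_t esz)
    {
        std::size_t alignmentMark = ((std::size_t)src) | ((std::size_t)dst);
        if (esz == 8 && (alignmentMark & 7) == 0)
            reverseRowTyped<std::uint64_t>(src, dst, elems);
        else if (esz == 4 && (alignmentMark & 3) == 0)
            reverseRowTyped<std::uint32_t>(src, dst, elems);
        else
            reverseRowBytes(src, dst, elems, esz);   // misaligned or unusual element size
    }

    int main()
    {
        std::vector<unsigned char> src(8 * 4), dst(8 * 4);
        for (std::size_t i = 0; i < src.size(); i++)
            src[i] = (unsigned char)i;
        reverseRow(src.data(), dst.data(), 8, 4);    // reverse a row of eight 4-byte elements
        return 0;
    }

On AArch64 and x86 the macro stays 0, so the gating compiles away and the SIMD paths are used unconditionally, which is why the patch wraps the checks in #if CV_CHECK_ALIGNMENT rather than performing them on every platform.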