From 027769bf5db2c8645b2b9f2b85689b83c373f832 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com>
Date: Fri, 11 Oct 2019 11:34:17 -0400
Subject: [PATCH] Merge pull request #15662 from ChipKerchner:addVReverseIntrinsic

* New v_reverse HAL intrinsic for reversing the ordering of a vector

* Fix conflict.

* Try to resolve conflict again.

* Try one more time.

* Add _MM_SHUFFLE. Remove non-vectorized code in SSE2. Fix copy-and-paste issue with NEON.

* Change v_uint16x8 SSE2 version to use shuffles
---
 .../include/opencv2/core/hal/intrin_avx.hpp   | 48 ++++++++++++
 .../opencv2/core/hal/intrin_avx512.hpp        | 73 +++++++++++++++++++
 .../include/opencv2/core/hal/intrin_cpp.hpp   | 21 ++++++
 .../include/opencv2/core/hal/intrin_neon.hpp  | 46 ++++++++++++
 .../include/opencv2/core/hal/intrin_sse.hpp   | 53 ++++++++++++++
 .../include/opencv2/core/hal/intrin_vsx.hpp   | 47 ++++++++++++
 modules/core/test/test_intrin_utils.hpp       | 26 +++++++
 7 files changed, 314 insertions(+)

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 8f6c982c72..fbd6f470cd 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1012,6 +1012,54 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps
 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)

+/** Reverse **/
+inline v_uint8x32 v_reverse(const v_uint8x32 &a)
+{
+    static const __m256i perm = _mm256_setr_epi8(
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    __m256i vec = _mm256_shuffle_epi8(a.val, perm);
+    return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
+}
+
+inline v_int8x32 v_reverse(const v_int8x32 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x16 v_reverse(const v_uint16x16 &a)
+{
+    static const __m256i perm = _mm256_setr_epi8(
+            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
+            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    __m256i vec = _mm256_shuffle_epi8(a.val, perm);
+    return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
+}
+
+inline v_int16x16 v_reverse(const v_int16x16 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x8 v_reverse(const v_uint32x8 &a)
+{
+    static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
+}
+
+inline v_int32x8 v_reverse(const v_int32x8 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x8 v_reverse(const v_float32x8 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x4 v_reverse(const v_uint64x4 &a)
+{
+    return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
+}
+
+inline v_int64x4 v_reverse(const v_int64x4 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x4 v_reverse(const v_float64x4 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
 ////////// Reduce and mask /////////

 /** Reduce **/
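A note for readers: `_mm256_shuffle_epi8` only permutes bytes within each 128-bit lane, which is why the AVX2 implementations above follow the in-lane shuffle with `_mm256_permute2x128_si256(vec, vec, 1)` to swap the two lanes. Below is a minimal standalone sketch of the same two-step idea written against raw AVX2 intrinsics; it is illustration only, not part of the patch, and assumes a compiler invoked with something like -mavx2.

    // Sketch (not from the patch): full 32-byte reverse on AVX2.
    #include <immintrin.h>
    #include <cstdio>

    int main()
    {
        unsigned char in[32], out[32];
        for (int i = 0; i < 32; ++i) in[i] = (unsigned char)i;

        __m256i v = _mm256_loadu_si256((const __m256i*)in);
        const __m256i perm = _mm256_setr_epi8(
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        __m256i r = _mm256_shuffle_epi8(v, perm);  // reverse bytes inside each 128-bit lane
        r = _mm256_permute2x128_si256(r, r, 1);    // swap the two 128-bit lanes
        _mm256_storeu_si256((__m256i*)out, r);

        for (int i = 0; i < 32; ++i) printf("%d ", out[i]);  // prints 31 30 ... 1 0
        printf("\n");
        return 0;
    }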
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
index 844c546e38..2c31a8d014 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
@@ -1068,6 +1068,79 @@ OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64)
 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd)

+/** Reverse **/
+inline v_uint8x64 v_reverse(const v_uint8x64 &a)
+{
+#if CV_AVX_512VBMI
+    static const __m512i perm = _mm512_set_epi32(
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
+            0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
+            0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
+    return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
+#else
+    static const __m512i shuf = _mm512_set_epi32(
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
+    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
+    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
+    return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
+#endif
+}
+
+inline v_int8x64 v_reverse(const v_int8x64 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x32 v_reverse(const v_uint16x32 &a)
+{
+#if CV_AVX_512VBMI
+    static const __m512i perm = _mm512_set_epi32(
+            0x00000001, 0x00020003, 0x00040005, 0x00060007,
+            0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
+            0x00100011, 0x00120013, 0x00140015, 0x00160017,
+            0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
+    return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
+#else
+    static const __m512i shuf = _mm512_set_epi32(
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
+    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
+    return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
+#endif
+}
+
+inline v_int16x32 v_reverse(const v_int16x32 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x16 v_reverse(const v_uint32x16 &a)
+{
+    static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
+}
+
+inline v_int32x16 v_reverse(const v_int32x16 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x16 v_reverse(const v_float32x16 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x8 v_reverse(const v_uint64x8 &a)
+{
+    static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+    return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
+}
+
+inline v_int64x8 v_reverse(const v_int64x8 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x8 v_reverse(const v_float64x8 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
 ////////// Reduce /////////

 /** Reduce **/
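The AVX-512 paths above branch on CV_AVX_512VBMI: with VBMI a single cross-lane byte permute (`_mm512_permutexvar_epi8`) reverses all 64 bytes, while without it an in-lane `_mm512_shuffle_epi8` is followed by a 64-bit block permute. For 32-bit and wider elements no such split is needed, since cross-lane `_mm512_permutexvar_*` at those widths is plain AVX-512F. A minimal standalone sketch of that single-permute case (illustration only, not part of the patch; assumes -mavx512f and AVX-512 hardware):

    // Sketch (not from the patch): reverse 16 floats with one cross-lane permute.
    #include <immintrin.h>
    #include <cstdio>

    int main()
    {
        float in[16], out[16];
        for (int i = 0; i < 16; ++i) in[i] = (float)i;

        // _mm512_set_epi32 lists lanes highest-first, so index lane i holds 15-i.
        const __m512i idx = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                             8, 9, 10, 11, 12, 13, 14, 15);
        __m512 v = _mm512_loadu_ps(in);
        _mm512_storeu_ps(out, _mm512_permutexvar_ps(idx, v));  // out[i] = in[15-i]

        for (int i = 0; i < 16; ++i) printf("%g ", out[i]);  // prints 15 14 ... 1 0
        printf("\n");
        return 0;
    }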
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 9b3dc84681..61ebd4b982 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -112,6 +112,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto
 - Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
   @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
+- Reverse: @ref v_reverse
 - Extract: @ref v_extract

@@ -215,6 +216,7 @@ Regular integers:
 |cvt_flt32          |   |   |   |   |   | x |
 |cvt_flt64          |   |   |   |   |   | x |
 |transpose4x4       |   |   |   |   | x | x |
+|reverse            | x | x | x | x | x | x |

 Big integers:

@@ -224,6 +226,7 @@ Big integers:
 |add, sub           | x | x |
 |shift              | x | x |
 |logical            | x | x |
+|reverse            | x | x |
 |extract            | x | x |
 |rotate (lanes)     | x | x |
 |cvt_flt64          |   | x |
@@ -250,6 +253,7 @@ Floating point:
 |transpose4x4       | x |   |
 |extract            | x | x |
 |rotate (lanes)     | x | x |
+|reverse            | x | x |

 @{ */
@@ -1724,6 +1728,23 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
     }
 }

+/** @brief Vector reverse order
+
+Reverse the order of the vector
+Scheme:
+@code
+  REG {A1 ... An} ==> REG {An ... A1}
+@endcode
+For all types. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = a.s[n-i-1];
+    return c;
+}
+
 /** @brief Vector extract

 Scheme:
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 3e8321aca3..abbd635fac 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1585,6 +1585,52 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
 OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
 #endif

+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    uint8x16_t vec = vrev64q_u8(a.val);
+    return v_uint8x16(vextq_u8(vec, vec, 8));
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    uint16x8_t vec = vrev64q_u16(a.val);
+    return v_uint16x8(vextq_u16(vec, vec, 4));
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    uint32x4_t vec = vrev64q_u32(a.val);
+    return v_uint32x4(vextq_u32(vec, vec, 2));
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    uint64x2_t vec = a.val;
+    uint64x1_t vec_lo = vget_low_u64(vec);
+    uint64x1_t vec_hi = vget_high_u64(vec);
+    return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
 template <int s> \
 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
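The NEON implementations rely on `vrev64q`, which reverses elements only within each 64-bit doubleword, so a full 128-bit reverse also needs the two halves swapped; `vextq` with an offset of half the lane count does that swap. A standalone sketch of the byte case (illustration only, not part of the patch; assumes an AArch64/NEON toolchain):

    // Sketch (not from the patch): full 16-byte reverse on NEON.
    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint8_t in[16], out[16];
        for (int i = 0; i < 16; ++i) in[i] = (uint8_t)i;

        uint8x16_t v = vld1q_u8(in);
        uint8x16_t r = vrev64q_u8(v);  // {7..0, 15..8}: reversed within each half
        r = vextq_u8(r, r, 8);         // swap the halves -> {15..0}
        vst1q_u8(out, r);

        for (int i = 0; i < 16; ++i) printf("%d ", out[i]);  // prints 15 14 ... 1 0
        printf("\n");
        return 0;
    }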
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index c4de1195b5..da167e3401 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1914,6 +1914,59 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)

+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+#if CV_SSSE3
+    static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
+#else
+    uchar CV_DECL_ALIGNED(32) d[16];
+    v_store_aligned(d, a);
+    return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
+#endif
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+#if CV_SSSE3
+    static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
+#else
+    __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
+    r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
+    r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
+    return v_uint16x8(r);
+#endif
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
 template<int s, typename _Tpvec>
 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
 {
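The SSE2 fallbacks above are built from `_mm_shuffle_epi32`, `_mm_shufflelo_epi16` and `_mm_shufflehi_epi16`, all driven by `_MM_SHUFFLE` immediates (the macro the commit message mentions adding). `_MM_SHUFFLE(d, c, b, a)` packs four 2-bit lane indices so that result lane 0 takes source lane `a`, lane 1 takes `b`, and so on; `_MM_SHUFFLE(0, 1, 2, 3)` is therefore a 4-lane reverse. A minimal standalone sketch (illustration only, not part of the patch; plain SSE2):

    // Sketch (not from the patch): reverse four 32-bit lanes with _MM_SHUFFLE.
    #include <emmintrin.h>
    #include <cstdio>

    int main()
    {
        int in[4] = {10, 20, 30, 40}, out[4];
        __m128i v = _mm_loadu_si128((const __m128i*)in);
        __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));  // lane i <- lane 3-i
        _mm_storeu_si128((__m128i*)out, r);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // prints 40 30 20 10
        return 0;
    }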
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index 0d65ca5e7a..5b4a0d4137 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -678,6 +678,53 @@ OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
 OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
 OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)

+/* Reverse */
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+    vec_uchar16 vec = (vec_uchar16)a.val;
+    return v_uint8x16(vec_perm(vec, vec, perm));
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
+    vec_uchar16 vec = (vec_uchar16)a.val;
+    return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm)));
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+    vec_uchar16 vec = (vec_uchar16)a.val;
+    return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm)));
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+    vec_uchar16 vec = (vec_uchar16)a.val;
+    return v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm)));
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
 /* Extract */
 template<int s, typename _Tpvec>
 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
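With all backends in place, callers reach the new intrinsic through the universal-intrinsics API exactly as the tests below exercise it. A short usage sketch (illustration only, not part of the patch; assumes OpenCV headers that include this change):

    // Sketch (not from the patch): reversing lanes via the universal intrinsics API.
    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
        float buf[4] = {1.f, 2.f, 3.f, 4.f};
        cv::v_float32x4 a = cv::v_load(buf);
        cv::v_store(buf, cv::v_reverse(a));
        printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);  // prints 4 3 2 1
        return 0;
    }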
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index fcb6b93a3c..2226502591 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -1115,6 +1115,22 @@ template<typename R> struct TheTest
         return *this;
     }

+    TheTest & test_reverse()
+    {
+        Data<R> dataA;
+        R a = dataA;
+
+        Data<R> resB = v_reverse(a);
+
+        for (int i = 0; i < R::nlanes; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ(dataA[R::nlanes - i - 1], resB[i]);
+        }
+
+        return *this;
+    }
+
     template<int s>
     TheTest & test_extract()
     {
@@ -1459,6 +1475,7 @@ void test_hal_intrin_uint8()
         .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
         .test_pack_b()
         .test_unpack()
+        .test_reverse()
         .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
         .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
         ;
@@ -1497,6 +1514,7 @@ void test_hal_intrin_int8()
         .test_popcount()
         .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
         .test_unpack()
+        .test_reverse()
         .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
         .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
         ;
@@ -1529,6 +1547,7 @@ void test_hal_intrin_uint16()
         .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
         .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
         .test_unpack()
+        .test_reverse()
         .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
         .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
         ;
@@ -1561,6 +1580,7 @@ void test_hal_intrin_int16()
         .test_popcount()
         .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
         .test_unpack()
+        .test_reverse()
         .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
         .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
         ;
@@ -1590,6 +1610,7 @@ void test_hal_intrin_uint32()
         .test_popcount()
         .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
         .test_unpack()
+        .test_reverse()
         .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
         .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
         .test_transpose()
@@ -1619,6 +1640,7 @@ void test_hal_intrin_int32()
         .test_mask()
         .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
         .test_unpack()
+        .test_reverse()
         .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
         .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
         .test_float_cvt32()
@@ -1637,6 +1659,7 @@ void test_hal_intrin_uint64()
         .test_addsub()
         .test_shift<1>().test_shift<8>()
         .test_logic()
+        .test_reverse()
         .test_extract<0>().test_extract<1>()
         .test_rotate<0>().test_rotate<1>()
         ;
@@ -1650,6 +1673,7 @@ void test_hal_intrin_int64()
         .test_addsub()
         .test_shift<1>().test_shift<8>()
         .test_logic()
+        .test_reverse()
         .test_extract<0>().test_extract<1>()
         .test_rotate<0>().test_rotate<1>()
         .test_cvt64_double()
@@ -1680,6 +1704,7 @@ void test_hal_intrin_float32()
         .test_matmul()
         .test_transpose()
         .test_reduce_sum4()
+        .test_reverse()
         .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
         .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
         ;
@@ -1709,6 +1734,7 @@ void test_hal_intrin_float64()
         .test_unpack()
         .test_float_math()
         .test_float_cvt32()
+        .test_reverse()
         .test_extract<0>().test_extract<1>()
         .test_rotate<0>().test_rotate<1>()
         ;