Reworked v_popcount implementation to provide number of bits in a single lane
parent b998c06d08
commit 96ab78dc4f
intrin_avx.hpp (AVX backend):

@@ -1015,6 +1015,34 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd
 ////////// Reduce and mask /////////

 /** Reduce **/
+inline unsigned v_reduce_sum(const v_uint8x32& a)
+{
+    __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
+    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
+}
+inline int v_reduce_sum(const v_int8x32& a)
+{
+    __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
+    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
+}
+#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
+inline sctype v_reduce_##func(const _Tpvec& a) \
+{ \
+    __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
+    val = intrin(val, _mm_srli_si128(val,8)); \
+    val = intrin(val, _mm_srli_si128(val,4)); \
+    val = intrin(val, _mm_srli_si128(val,2)); \
+    val = intrin(val, _mm_srli_si128(val,1)); \
+    return (sctype)_mm_cvtsi128_si32(val); \
+}
+
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
+
 #define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
 inline sctype v_reduce_##func(const _Tpvec& a) \
 { \
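A note on the byte-sum helpers added above: _mm256_sad_epu8 against an all-zero vector sums each group of eight unsigned bytes into a 64-bit lane, and the remaining lanes are folded with ordinary adds. The signed overload cannot feed SAD directly, so it first XORs every byte with 0x80 (turning signed values into biased unsigned ones) and subtracts the accumulated bias afterwards: 32 lanes x 128 = 4096 here, and 16 x 128 = 2048 in the SSE version further down. A minimal scalar sketch of that bias trick (illustration only; the function name is ours, not OpenCV's):

#include <cstdint>

int reduce_sum_s8_sketch(const int8_t a[32])
{
    unsigned biased = 0;
    for (int i = 0; i < 32; ++i)
        biased += (uint8_t)(a[i] ^ 0x80);  // x ^ 0x80 equals x + 128 for a signed byte
    return (int)biased - 32 * 128;         // remove the per-lane bias (4096 in total)
}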
@@ -1062,31 +1090,6 @@ OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)

-inline ushort v_reduce_sum(const v_uint16x16& a)
-{
-    __m128i a0 = _v256_extract_low(a.val);
-    __m128i a1 = _v256_extract_high(a.val);
-
-    __m128i s0 = _mm_adds_epu16(a0, a1);
-    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
-    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
-    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
-
-    return (ushort)_mm_cvtsi128_si32(s0);
-}
-
-inline short v_reduce_sum(const v_int16x16& a)
-{
-    __m256i s0 = _mm256_hadds_epi16(a.val, a.val);
-    s0 = _mm256_hadds_epi16(s0, s0);
-    s0 = _mm256_hadds_epi16(s0, s0);
-
-    __m128i s1 = _v256_extract_high(s0);
-    s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
-
-    return (short)_mm_cvtsi128_si32(s1);
-}
-
 inline int v_reduce_sum(const v_int32x8& a)
 {
     __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
@@ -1101,6 +1104,11 @@ inline int v_reduce_sum(const v_int32x8& a)
 inline unsigned v_reduce_sum(const v_uint32x8& a)
 { return v_reduce_sum(v_reinterpret_as_s32(a)); }

+inline int v_reduce_sum(const v_int16x16& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+inline unsigned v_reduce_sum(const v_uint16x16& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+
 inline float v_reduce_sum(const v_float32x8& a)
 {
     __m256 s0 = _mm256_hadd_ps(a.val, a.val);
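The 16-bit sums above now widen each half with v_expand_low/v_expand_high and reuse the 32-bit reduction, so intermediate additions no longer saturate the way the removed _mm_adds/_mm256_hadds versions could, and the scalar result type widens from short/ushort to int/unsigned. Roughly, the new behaviour corresponds to this scalar model (illustrative only):

#include <cstdint>

int reduce_sum_s16_sketch(const int16_t a[16])
{
    int sum = 0;                 // 32-bit accumulator, no 16-bit saturation
    for (int i = 0; i < 16; ++i)
        sum += a[i];             // each lane is widened before it is added
    return sum;
}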
@@ -1112,6 +1120,18 @@ inline float v_reduce_sum(const v_float32x8& a)
     return _mm_cvtss_f32(s1);
 }

+inline uint64 v_reduce_sum(const v_uint64x4& a)
+{
+    uint64 CV_DECL_ALIGNED(32) idx[2];
+    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
+    return idx[0] + idx[1];
+}
+inline int64 v_reduce_sum(const v_int64x4& a)
+{
+    int64 CV_DECL_ALIGNED(32) idx[2];
+    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
+    return idx[0] + idx[1];
+}
 inline double v_reduce_sum(const v_float64x4& a)
 {
     __m256d s0 = _mm256_hadd_pd(a.val, a.val);
@@ -1166,26 +1186,49 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
 }

 /** Popcount **/
-#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
-inline v_uint32x8 v_popcount(const _Tpvec& a) \
-{ \
-    const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
-    const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
-    const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
-    v_uint32x8 p = v_reinterpret_as_u32(a); \
-    p = ((p >> 1) & m1) + (p & m1); \
-    p = ((p >> 2) & m2) + (p & m2); \
-    p = ((p >> 4) & m4) + (p & m4); \
-    p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
-    return p; \
-}
-
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
+inline v_uint8x32 v_popcount(const v_uint8x32& a)
+{
+    __m256i m1 = _mm256_set1_epi32(0x55555555);
+    __m256i m2 = _mm256_set1_epi32(0x33333333);
+    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
+    __m256i p = a.val;
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
+    return v_uint8x32(p);
+}
+inline v_uint16x16 v_popcount(const v_uint16x16& a)
+{
+    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
+}
+inline v_uint32x8 v_popcount(const v_uint32x8& a)
+{
+    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    p += v_rotate_right<2>(p);
+    return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
+}
+inline v_uint64x4 v_popcount(const v_uint64x4& a)
+{
+    __m256i m1 = _mm256_set1_epi32(0x55555555);
+    __m256i m2 = _mm256_set1_epi32(0x33333333);
+    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
+    __m256i p = a.val;
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
+    return v_uint64x4(_mm256_sad_epu8(p, _mm256_setzero_si256()));
+}
+inline v_uint8x32 v_popcount(const v_int8x32& a)
+{ return v_popcount(v_reinterpret_as_u8(a)); }
+inline v_uint16x16 v_popcount(const v_int16x16& a)
+{ return v_popcount(v_reinterpret_as_u16(a)); }
+inline v_uint32x8 v_popcount(const v_int32x8& a)
+{ return v_popcount(v_reinterpret_as_u32(a)); }
+inline v_uint64x4 v_popcount(const v_int64x4& a)
+{ return v_popcount(v_reinterpret_as_u64(a)); }

 /** Mask **/
 inline int v_signmask(const v_int8x32& a)
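The reworked v_popcount no longer collapses everything into a v_uint32x8 of partial sums; as the commit message says, each output lane now holds the bit count of the corresponding input lane, and the result type matches the input width (v_uint8x32 in, v_uint8x32 of counts out, and so on). Byte counts come from the classic SWAR masked adds, wider lanes are built by adding rotated byte counts, and the 64-bit case reuses _mm256_sad_epu8 against zero. A hypothetical illustration of the per-lane semantics, using the 128-bit types for brevity (not code from the patch):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void popcount_lanes_demo()
{
    ushort src[8] = { 0x0000, 0x0001, 0x0003, 0x00FF, 0x0F0F, 0xFFFF, 0x8000, 0x7FFF };
    v_uint16x8 counts = v_popcount(v_load(src));   // one bit count per 16-bit lane
    ushort out[8];
    v_store(out, counts);                          // out = {0, 1, 2, 8, 8, 16, 1, 15}
}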
intrin_neon.hpp (NEON backend):

@@ -910,6 +910,31 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 #endif

+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{
+    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{
+    int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
+    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    return vget_lane_s32(vpadd_s32(t1, t1), 0);
+}
+inline unsigned v_reduce_sum(const v_uint16x8& a)
+{
+    uint32x4_t t0 = vpaddlq_u16(a.val);
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline int v_reduce_sum(const v_int16x8& a)
+{
+    int32x4_t t0 = vpaddlq_s16(a.val);
+    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    return vget_lane_s32(vpadd_s32(t1, t1), 0);
+}
+
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -918,12 +943,10 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 }

-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16)

 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
@@ -942,6 +965,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)

+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
+inline int64 v_reduce_sum(const v_int64x2& a)
+{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
@@ -1007,21 +1034,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
     return vget_lane_f32(vpadd_f32(t1, t1), 0);
 }

-#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
-inline v_uint32x4 v_popcount(const _Tpvec& a) \
-{ \
-    uint8x16_t t = vcntq_u8(cast(a.val)); \
-    uint16x8_t t0 = vpaddlq_u8(t); /* 16 -> 8 */ \
-    uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
-    return v_uint32x4(t1); \
-}
-
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{ return v_uint8x16(vcntq_u8(a.val)); }
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }

 inline int v_signmask(const v_uint8x16& a)
 {
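Unlike the x86 backends, NEON has a per-byte population count instruction, so v_popcount(v_uint8x16) is a single vcntq_u8; the wider overloads simply pairwise-add those byte counts with vpaddlq_u8/u16/u32 until the counts sit in 16-, 32- or 64-bit lanes. A scalar model of that widening chain for one 64-bit lane (illustration only):

#include <cstdint>

uint64_t popcount_u64_lane_sketch(const uint8_t bytes[8])
{
    uint64_t total = 0;
    for (int i = 0; i < 8; ++i)
    {
        uint8_t b = bytes[i], c = 0;
        for (; b; b >>= 1)
            c += b & 1;          // per-byte count (what vcntq_u8 produces)
        total += c;              // the pairwise widening adds (vpaddlq_*) accumulate these
    }
    return total;
}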
intrin_sse.hpp (SSE backend):

@@ -302,8 +302,8 @@ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)
 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
 { return _Tpvec(cast(a.val)); }

-OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
-OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
@@ -1393,6 +1393,41 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)

+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{
+    __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{
+    __m128i half = _mm_set1_epi8((schar)-128);
+    half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
+    return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
+}
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
+inline schar v_reduce_##func(const v_int8x16& a) \
+{ \
+    __m128i val = a.val; \
+    __m128i smask = _mm_set1_epi8((schar)-128); \
+    val = _mm_xor_si128(val, smask); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
+    return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
+} \
+inline uchar v_reduce_##func(const v_uint8x16& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
+    return (uchar)_mm_cvtsi128_si32(val); \
+}
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
+
 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
 { \
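The 8-bit reductions added above rely on another use of the sign-bit flip: SSE2 only provides unsigned byte min/max (_mm_min_epu8/_mm_max_epu8), and XORing each byte with 0x80 maps the signed ordering onto the unsigned ordering, so the code reduces in the biased domain and flips the result back. A scalar sketch of the same idea (function name ours):

#include <algorithm>
#include <cstdint>

int8_t reduce_max_s8_sketch(const int8_t* a, int n)
{
    uint8_t best = 0;                                        // 0 is the biased image of -128
    for (int i = 0; i < n; ++i)
        best = std::max<uint8_t>(best, (uint8_t)(a[i] ^ 0x80));
    return (int8_t)(best ^ 0x80);                            // undo the bias
}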
@@ -1412,26 +1447,8 @@ inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
     return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
 }
-#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
-inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
-{ \
-    __m128i val = a.val; \
-    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
-    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
-    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
-    return (scalartype)_mm_cvtsi128_si32(val); \
-} \
-inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
-{ \
-    __m128i val = a.val; \
-    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
-    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
-    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
-    return (unsigned scalartype)_mm_cvtsi128_si32(val); \
-}
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
-OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)

 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
 inline scalartype v_reduce_sum(const _Tpvec& a) \
@@ -1456,6 +1473,23 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)

+inline int v_reduce_sum(const v_int16x8& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+inline unsigned v_reduce_sum(const v_uint16x8& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{
+    uint64 CV_DECL_ALIGNED(32) idx[2];
+    v_store_aligned(idx, a);
+    return idx[0] + idx[1];
+}
+inline int64 v_reduce_sum(const v_int64x2& a)
+{
+    int64 CV_DECL_ALIGNED(32) idx[2];
+    v_store_aligned(idx, a);
+    return idx[0] + idx[1];
+}
 inline double v_reduce_sum(const v_float64x2& a)
 {
     double CV_DECL_ALIGNED(32) idx[2];
@@ -1520,27 +1554,49 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
     return v_reduce_sum(v_absdiff(a, b));
 }

-#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
-inline v_uint32x4 v_popcount(const _Tpvec& a) \
-{ \
-    __m128i m1 = _mm_set1_epi32(0x55555555); \
-    __m128i m2 = _mm_set1_epi32(0x33333333); \
-    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
-    __m128i p = a.val; \
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
-    p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
-    p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
-    return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{
+    __m128i m1 = _mm_set1_epi32(0x55555555);
+    __m128i m2 = _mm_set1_epi32(0x33333333);
+    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
+    __m128i p = a.val;
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
+    return v_uint8x16(p);
 }
-
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{
+    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
+}
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{
+    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    p += v_rotate_right<2>(p);
+    return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
+}
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{
+    __m128i m1 = _mm_set1_epi32(0x55555555);
+    __m128i m2 = _mm_set1_epi32(0x33333333);
+    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
+    __m128i p = a.val;
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
+    return v_uint64x2(_mm_sad_epu8(p, _mm_setzero_si128()));
+}
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{ return v_popcount(v_reinterpret_as_u8(a)); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_popcount(v_reinterpret_as_u16(a)); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_popcount(v_reinterpret_as_u32(a)); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_popcount(v_reinterpret_as_u64(a)); }

 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
 inline int v_signmask(const _Tpvec& a) \
intrin_vsx.hpp (VSX backend):

@@ -692,15 +692,27 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
 ////////// Reduce and mask /////////

 /** Reduce **/
-inline short v_reduce_sum(const v_int16x8& a)
+inline uint v_reduce_sum(const v_uint8x16& a)
 {
+    const vec_uint4 zero4 = vec_uint4_z;
+    vec_uint4 sum4 = vec_sum4s(a.val, zero4);
+    return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{
+    const vec_int4 zero4 = vec_int4_z;
+    vec_int4 sum4 = vec_sum4s(a.val, zero4);
+    return (int)vec_extract(vec_sums(sum4, zero4), 3);
+}
+inline int v_reduce_sum(const v_int16x8& a)
+{
     const vec_int4 zero = vec_int4_z;
-    return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
+    return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
 }
-inline ushort v_reduce_sum(const v_uint16x8& a)
+inline uint v_reduce_sum(const v_uint16x8& a)
 {
     const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
-    return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
+    return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
 }

 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
@@ -719,6 +731,14 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)

+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{
+    return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
+}
+inline int64 v_reduce_sum(const v_int64x2& a)
+{
+    return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
+}
 inline double v_reduce_sum(const v_float64x2& a)
 {
     return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
@@ -736,6 +756,19 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)

+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a) \
+{ \
+    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
+    rs = func(rs, vec_sld(rs, rs, 4)); \
+    rs = func(rs, vec_sld(rs, rs, 2)); \
+    return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
+
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
 {
@@ -792,9 +825,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
 }

 /** Popcount **/
-template<typename _Tpvec>
-inline v_uint32x4 v_popcount(const _Tpvec& a)
-{ return v_uint32x4(vec_popcntu(vec_uint4_c(a.val))); }
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{ return v_uint8x16(vec_popcntu(a.val)); }
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{ return v_uint8x16(vec_popcntu(a.val)); }
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{ return v_uint16x8(vec_popcntu(a.val)); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_uint16x8(vec_popcntu(a.val)); }
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{ return v_uint32x4(vec_popcntu(a.val)); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_uint32x4(vec_popcntu(a.val)); }
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{ return v_uint64x2(vec_popcntu(a.val)); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_uint64x2(vec_popcntu(a.val)); }

 /** Mask **/
 inline int v_signmask(const v_uint8x16& a)
normHamming (core):

@@ -70,16 +70,14 @@ int normHamming(const uchar* a, int n)
     }
 #endif // CV_POPCNT

-#if CV_SIMD128
+#if CV_SIMD
     {
-        v_uint32x4 t = v_setzero_u32();
-        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
-        {
-            t += v_popcount(v_load(a + i));
-        }
+        v_uint64 t = vx_setzero_u64();
+        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
         result += v_reduce_sum(t);
     }
-#endif // CV_SIMD128
+#endif // CV_SIMD
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
     {
@@ -141,16 +139,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
     }
 #endif // CV_POPCNT

-#if CV_SIMD128
+#if CV_SIMD
     {
-        v_uint32x4 t = v_setzero_u32();
-        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
-        {
-            t += v_popcount(v_load(a + i) ^ v_load(b + i));
-        }
+        v_uint64 t = vx_setzero_u64();
+        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
         result += v_reduce_sum(t);
     }
-#endif // CV_SIMD128
+#endif // CV_SIMD
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
     {
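The normHamming loops switch from the fixed 128-bit CV_SIMD128 path to the width-agnostic CV_SIMD one: v_uint8, v_uint64, vx_load and vx_setzero_u64 resolve to the widest instruction set enabled at build time, and the loaded bytes are reinterpreted as 64-bit lanes so each v_popcount yields per-64-bit-lane counts that can be accumulated in t without any realistic risk of overflow before the final v_reduce_sum. A self-contained sketch of the same pattern, with a scalar tail added for completeness (a paraphrase, not the exact patched function):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static int hamming_norm_sketch(const uchar* a, int n)
{
    int i = 0, result = 0;
#if CV_SIMD
    v_uint64 t = vx_setzero_u64();
    for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
        t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));  // bit counts per 64-bit lane
    result += (int)v_reduce_sum(t);
#endif
    for (; i < n; ++i)                                           // scalar tail
        for (uchar b = a[i]; b; b >>= 1)
            result += b & 1;
    return result;
}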