opencv/modules/core/src/stat.simd.hpp

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "opencv2/core/hal/intrin.hpp"

#include <cstring>

namespace cv { namespace hal {

extern const uchar popCountTable[256];

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

// forward declarations
int normHamming(const uchar* a, int n);
int normHamming(const uchar* a, const uchar* b, int n);

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

#if CV_AVX2
// Returns the 32-bit element at position i (0..7) of reg.
// The register is written out to a 32-byte-aligned local array and the element
// is then read back by ordinary indexing, so i is a plain runtime argument.
// The range of i is only verified through CV_DbgAssert.
static inline int _mm256_extract_epi32_(__m256i reg, const int i)
{
    CV_DbgAssert(0 <= i && i < 8);
    CV_DECL_ALIGNED(32) int lanes[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes), reg);
    return lanes[i];
}
#endif

/**
 * Hamming norm of one byte buffer: sums the population count of the n bytes
 * starting at a.
 *
 * The buffer is walked front to back by successively narrower stages; each
 * stage resumes at the index i where the previous one stopped:
 *   1. whole vectors through universal intrinsics (CV_SIMD / CV_SIMD_SCALABLE),
 *   2. 8-byte and 4-byte words through the CV_POPCNT_* macros (CV_POPCNT),
 *   3. groups of four popCountTable lookups (CV_ENABLE_UNROLLED),
 *   4. one popCountTable lookup per remaining byte.
 *
 * @param a  input bytes; n bytes are read starting here
 * @param n  number of bytes to process (n == 0 yields 0)
 * @return   the accumulated bit count, as int
 */
int normHamming(const uchar* a, int n)
{
    CV_AVX_GUARD;

    int i = 0;
    int result = 0;

#if (CV_SIMD || CV_SIMD_SCALABLE)
    {
        // Per-lane popcounts are accumulated in a 64-bit vector and reduced to
        // a scalar only once, after the loop.
        v_uint64 t = vx_setzero_u64();
        for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
            t = v_add(t, v_popcount(v_reinterpret_as_u64(vx_load(a + i))));
        result = (int)v_reduce_sum(t);
        vx_cleanup();
    }
#endif

#if CV_POPCNT
    {
#  if defined CV_POPCNT_U64
        for(; i <= n - 8; i += 8)
        {
            // a + i has no alignment guarantee and points at uchar data, so the
            // word is copied out with memcpy instead of dereferencing a casted
            // pointer (which would be a misaligned, aliasing-violating read).
            uint64 word;
            std::memcpy(&word, a + i, sizeof(word));
            result += (int)CV_POPCNT_U64(word);
        }
#  endif
        for(; i <= n - 4; i += 4)
        {
            // Same reasoning as above, for the 4-byte step.
            uint word;
            std::memcpy(&word, a + i, sizeof(word));
            result += CV_POPCNT_U32(word);
        }
    }
#endif
#if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4)
    {
        result += popCountTable[a[i]] + popCountTable[a[i+1]] +
        popCountTable[a[i+2]] + popCountTable[a[i+3]];
    }
#endif
    // Scalar tail: whatever the wider stages above left unprocessed.
    for(; i < n; i++)
    {
        result += popCountTable[a[i]];
    }
    return result;
}

/**
 * Hamming distance between two byte buffers: sums the population count of
 * a[k] ^ b[k] over the first n bytes.
 *
 * The buffers are walked front to back by successively narrower stages; each
 * stage resumes at the index i where the previous one stopped:
 *   1. whole vectors through universal intrinsics (CV_SIMD / CV_SIMD_SCALABLE),
 *   2. 8-byte and 4-byte words through the CV_POPCNT_* macros (CV_POPCNT),
 *   3. groups of four popCountTable lookups (CV_ENABLE_UNROLLED),
 *   4. one popCountTable lookup per remaining byte.
 *
 * @param a  first input; n bytes are read starting here
 * @param b  second input; n bytes are read starting here
 * @param n  number of bytes to compare (n == 0 yields 0)
 * @return   the accumulated count of differing bits, as int
 */
int normHamming(const uchar* a, const uchar* b, int n)
{
    CV_AVX_GUARD;

    int i = 0;
    int result = 0;

#if (CV_SIMD || CV_SIMD_SCALABLE)
    {
        // Per-lane popcounts of the XOR are accumulated in a 64-bit vector and
        // reduced to a scalar only once, after the loop.
        v_uint64 t = vx_setzero_u64();
        for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_xor(vx_load(a + i), vx_load(b + i)))));
        result = (int)v_reduce_sum(t);
        // Mirror the single-buffer overload, which calls vx_cleanup() once its
        // vector stage is finished.
        vx_cleanup();
    }
#endif

#if CV_POPCNT
    {
#  if defined CV_POPCNT_U64
        for(; i <= n - 8; i += 8)
        {
            // Neither pointer has an alignment guarantee and both point at
            // uchar data, so the words are copied out with memcpy instead of
            // dereferencing casted pointers.
            uint64 wa, wb;
            std::memcpy(&wa, a + i, sizeof(wa));
            std::memcpy(&wb, b + i, sizeof(wb));
            result += (int)CV_POPCNT_U64(wa ^ wb);
        }
#  endif
        for(; i <= n - 4; i += 4)
        {
            // Same reasoning as above, for the 4-byte step.
            uint wa, wb;
            std::memcpy(&wa, a + i, sizeof(wa));
            std::memcpy(&wb, b + i, sizeof(wb));
            result += CV_POPCNT_U32(wa ^ wb);
        }
    }
#endif
#if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4)
    {
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    }
#endif
    // Scalar tail: whatever the wider stages above left unprocessed.
    for(; i < n; i++)
    {
        result += popCountTable[a[i] ^ b[i]];
    }
    return result;
}

#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

CV_CPU_OPTIMIZATION_NAMESPACE_END
}} //cv::hal
core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00			`// This file is part of OpenCV project.`
			`// It is subject to the license terms in the LICENSE file found in the top-level directory`
			`// of this distribution and at http://opencv.org/license.html.`

			`#include "opencv2/core/hal/intrin.hpp"`

			`namespace cv { namespace hal {`
core(stat): register dispatched code, fix build 2017-07-02 21:23:13 +08:00
			`extern const uchar popCountTable[256];`

core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00			`CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN`

			`// forward declarations`
			`int normHamming(const uchar* a, int n);`
			`int normHamming(const uchar* a, const uchar* b, int n);`

			`#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY`

			`#if CV_AVX2`
			`static inline int _mm256_extract_epi32_(__m256i reg, const int i)`
			`{`
			`CV_DECL_ALIGNED(32) int reg_data[8];`
			`CV_DbgAssert(0 <= i && i < 8);`
			`_mm256_store_si256((__m256i*)reg_data, reg);`
			`return reg_data[i];`
			`}`
			`#endif`

			`int normHamming(const uchar* a, int n)`
			`{`
core(stat): add required CV_AVX_GUARD Added guard with 'vzeroupper' instruction 2017-07-02 23:33:05 +08:00			`CV_AVX_GUARD;`

core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00			`int i = 0;`
			`int result = 0;`

Merge pull request #23980 from hanliutong:rewrite-core Rewrite Universal Intrinsic code by using new API: Core module. #23980 The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API. The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885. Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are: 1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited - ./modules/core/src/stat.simd.hpp - ./modules/core/src/matrix_transform.cpp - ./modules/core/src/matmul.simd.hpp 2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly. - ./modules/core/src/mathfuncs_core.simd.hpp ```cpp struct v_atan_f32 { explicit v_atan_f32(const float& scale) { ... } v_float32 compute(const v_float32& y, const v_float32& x) { ... } ... v_float32 val90; // sizeless type can not used in a class v_float32 val180; v_float32 val360; v_float32 s; }; ``` 3. The API interface does not support/does not match - ./modules/core/src/norm.cpp Use `v_popcount`, ~~waiting for #23966~~ Fixed - ./modules/core/src/has_non_zero.simd.hpp Use illegal Universal Intrinsic API: For float type, there is no logical operation `\|`. Further discussion needed ```cpp /** @brief Bitwise OR Only for integer types. 
/ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator\|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator\|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); ``` ```cpp #if CV_SIMD typedef v_float32 v_type; const v_type v_zero = vx_setzero_f32(); constexpr const int unrollCount = 8; int step = v_type::nlanes unrollCount; int len0 = len & -step; const float* srcSimdEnd = src+len0; int countSIMD = static_cast<int>((srcSimdEnd-src)/step); while(!res && countSIMD--) { v_type v0 = vx_load(src); src += v_type::nlanes; v_type v1 = vx_load(src); src += v_type::nlanes; .... src += v_type::nlanes; v0 \|= v1; //Illegal ? .... //res = v_check_any(((v0 \| v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ res = !v_check_all(((v0 \| v4) == v_zero)); } v_cleanup(); #endif ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2023-08-11 13:33:33 +08:00			`#if (CV_SIMD \|\| CV_SIMD_SCALABLE)`
Updated AVX2 implementation of v_popcount for u8. 2019-05-14 23:48:36 +08:00			`{`
			`v_uint64 t = vx_setzero_u64();`
Merge pull request #23980 from hanliutong:rewrite-core Rewrite Universal Intrinsic code by using new API: Core module. #23980 The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API. The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885. Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are: 1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited - ./modules/core/src/stat.simd.hpp - ./modules/core/src/matrix_transform.cpp - ./modules/core/src/matmul.simd.hpp 2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly. - ./modules/core/src/mathfuncs_core.simd.hpp ```cpp struct v_atan_f32 { explicit v_atan_f32(const float& scale) { ... } v_float32 compute(const v_float32& y, const v_float32& x) { ... } ... v_float32 val90; // sizeless type can not used in a class v_float32 val180; v_float32 val360; v_float32 s; }; ``` 3. The API interface does not support/does not match - ./modules/core/src/norm.cpp Use `v_popcount`, ~~waiting for #23966~~ Fixed - ./modules/core/src/has_non_zero.simd.hpp Use illegal Universal Intrinsic API: For float type, there is no logical operation `\|`. Further discussion needed ```cpp /** @brief Bitwise OR Only for integer types. 
/ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator\|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator\|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); ``` ```cpp #if CV_SIMD typedef v_float32 v_type; const v_type v_zero = vx_setzero_f32(); constexpr const int unrollCount = 8; int step = v_type::nlanes unrollCount; int len0 = len & -step; const float* srcSimdEnd = src+len0; int countSIMD = static_cast<int>((srcSimdEnd-src)/step); while(!res && countSIMD--) { v_type v0 = vx_load(src); src += v_type::nlanes; v_type v1 = vx_load(src); src += v_type::nlanes; .... src += v_type::nlanes; v0 \|= v1; //Illegal ? .... //res = v_check_any(((v0 \| v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ res = !v_check_all(((v0 \| v4) == v_zero)); } v_cleanup(); #endif ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2023-08-11 13:33:33 +08:00			`for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())`
			`t = v_add(t, v_popcount(v_reinterpret_as_u64(vx_load(a + i))));`
Updated AVX2 implementation of v_popcount for u8. 2019-05-14 23:48:36 +08:00			`result = (int)v_reduce_sum(t);`
Improve vectorization in the 'norm' functions 2019-08-28 01:15:19 +08:00			`vx_cleanup();`
core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00			`}`
Updated AVX2 implementation of v_popcount for u8. 2019-05-14 23:48:36 +08:00			`#endif`
core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00
			`#if CV_POPCNT`
			`{`
			`# if defined CV_POPCNT_U64`
			`for(; i <= n - 8; i += 8)`
			`{`
			`result += (int)CV_POPCNT_U64(*(uint64*)(a + i));`
			`}`
			`# endif`
			`for(; i <= n - 4; i += 4)`
			`{`
			`result += CV_POPCNT_U32(*(uint*)(a + i));`
			`}`
			`}`
Updated AVX2 implementation of v_popcount for u8. 2019-05-14 23:48:36 +08:00			`#endif`
core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00			`#if CV_ENABLE_UNROLLED`
			`for(; i <= n - 4; i += 4)`
			`{`
			`result += popCountTable[a[i]] + popCountTable[a[i+1]] +`
			`popCountTable[a[i+2]] + popCountTable[a[i+3]];`
			`}`
			`#endif`
			`for(; i < n; i++)`
			`{`
			`result += popCountTable[a[i]];`
			`}`
			`return result;`
			`}`

			`int normHamming(const uchar* a, const uchar* b, int n)`
			`{`
core(stat): add required CV_AVX_GUARD Added guard with 'vzeroupper' instruction 2017-07-02 23:33:05 +08:00			`CV_AVX_GUARD;`

core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00			`int i = 0;`
			`int result = 0;`

Merge pull request #23980 from hanliutong:rewrite-core Rewrite Universal Intrinsic code by using new API: Core module. #23980 The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API. The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885. Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are: 1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited - ./modules/core/src/stat.simd.hpp - ./modules/core/src/matrix_transform.cpp - ./modules/core/src/matmul.simd.hpp 2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly. - ./modules/core/src/mathfuncs_core.simd.hpp ```cpp struct v_atan_f32 { explicit v_atan_f32(const float& scale) { ... } v_float32 compute(const v_float32& y, const v_float32& x) { ... } ... v_float32 val90; // sizeless type can not used in a class v_float32 val180; v_float32 val360; v_float32 s; }; ``` 3. The API interface does not support/does not match - ./modules/core/src/norm.cpp Use `v_popcount`, ~~waiting for #23966~~ Fixed - ./modules/core/src/has_non_zero.simd.hpp Use illegal Universal Intrinsic API: For float type, there is no logical operation `\|`. Further discussion needed ```cpp /** @brief Bitwise OR Only for integer types. 
/ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator\|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator\|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); ``` ```cpp #if CV_SIMD typedef v_float32 v_type; const v_type v_zero = vx_setzero_f32(); constexpr const int unrollCount = 8; int step = v_type::nlanes unrollCount; int len0 = len & -step; const float* srcSimdEnd = src+len0; int countSIMD = static_cast<int>((srcSimdEnd-src)/step); while(!res && countSIMD--) { v_type v0 = vx_load(src); src += v_type::nlanes; v_type v1 = vx_load(src); src += v_type::nlanes; .... src += v_type::nlanes; v0 \|= v1; //Illegal ? .... //res = v_check_any(((v0 \| v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ res = !v_check_all(((v0 \| v4) == v_zero)); } v_cleanup(); #endif ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2023-08-11 13:33:33 +08:00			`#if (CV_SIMD \|\| CV_SIMD_SCALABLE)`
Updated AVX2 implementation of v_popcount for u8. 2019-05-14 23:48:36 +08:00			`{`
			`v_uint64 t = vx_setzero_u64();`
Merge pull request #23980 from hanliutong:rewrite-core Rewrite Universal Intrinsic code by using new API: Core module. #23980 The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API. The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885. Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are: 1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited - ./modules/core/src/stat.simd.hpp - ./modules/core/src/matrix_transform.cpp - ./modules/core/src/matmul.simd.hpp 2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly. - ./modules/core/src/mathfuncs_core.simd.hpp ```cpp struct v_atan_f32 { explicit v_atan_f32(const float& scale) { ... } v_float32 compute(const v_float32& y, const v_float32& x) { ... } ... v_float32 val90; // sizeless type can not used in a class v_float32 val180; v_float32 val360; v_float32 s; }; ``` 3. The API interface does not support/does not match - ./modules/core/src/norm.cpp Use `v_popcount`, ~~waiting for #23966~~ Fixed - ./modules/core/src/has_non_zero.simd.hpp Use illegal Universal Intrinsic API: For float type, there is no logical operation `\|`. Further discussion needed ```cpp /** @brief Bitwise OR Only for integer types. 
/ template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator\|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator\|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); ``` ```cpp #if CV_SIMD typedef v_float32 v_type; const v_type v_zero = vx_setzero_f32(); constexpr const int unrollCount = 8; int step = v_type::nlanes unrollCount; int len0 = len & -step; const float* srcSimdEnd = src+len0; int countSIMD = static_cast<int>((srcSimdEnd-src)/step); while(!res && countSIMD--) { v_type v0 = vx_load(src); src += v_type::nlanes; v_type v1 = vx_load(src); src += v_type::nlanes; .... src += v_type::nlanes; v0 \|= v1; //Illegal ? .... //res = v_check_any(((v0 \| v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ res = !v_check_all(((v0 \| v4) == v_zero)); } v_cleanup(); #endif ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2023-08-11 13:33:33 +08:00			`for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())`
			`t = v_add(t, v_popcount(v_reinterpret_as_u64(v_xor(vx_load(a + i), vx_load(b + i)))));`
Updated AVX2 implementation of v_popcount for u8. 2019-05-14 23:48:36 +08:00			`result += (int)v_reduce_sum(t);`
core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00			`}`
Updated AVX2 implementation of v_popcount for u8. 2019-05-14 23:48:36 +08:00			`#endif`
core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00
			`#if CV_POPCNT`
			`{`
			`# if defined CV_POPCNT_U64`
			`for(; i <= n - 8; i += 8)`
			`{`
			`result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));`
			`}`
			`# endif`
			`for(; i <= n - 4; i += 4)`
			`{`
			`result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));`
			`}`
			`}`
Updated AVX2 implementation of v_popcount for u8. 2019-05-14 23:48:36 +08:00			`#endif`
core(stat): move implementations into .hpp file w/o changes 2017-07-02 21:07:58 +08:00			`#if CV_ENABLE_UNROLLED`
			`for(; i <= n - 4; i += 4)`
			`{`
			`result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +`
			`popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];`
			`}`
			`#endif`
			`for(; i < n; i++)`
			`{`
			`result += popCountTable[a[i] ^ b[i]];`
			`}`
			`return result;`
			`}`

			`#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY`

			`CV_CPU_OPTIMIZATION_NAMESPACE_END`
			`}} //cv::hal`